"""
Copyright 2015-2023 Stephane De Mita, Mathieu Siol
This file is part of EggLib.
EggLib is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
EggLib is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with EggLib. If not, see <http://www.gnu.org/licenses/>.
"""
import math
from .. import eggwrapper as _eggwrapper
from .. import _interface
from .. import _freq
from .. import _site
from .. io import _vcf
[docs]class ComputeStats(object):
"""ComputeStats(...)
Customizable and efficient analysis of diversity
data. This class is designed to minimize redundancy of underlying analyses
so it is best to compute as many statistics as possible with a
single instance. It also takes advantage of the object reuse policy,
improving the efficiency of analyses when several (and especially
many) datasets are examined in a row by the same instance of
:class:`~.ComputeStats`.
The constructor takes arguments that are automatically passed to the
:meth:`configure` method.
Statistics to compute are set using the method :meth:`~.ComputeStats.add_stats`
which allows specifying several statistics at once and which can
also be called several times to add more statistics to compute.
.. versionadded 3.3:
Added three-population site configuration statistic (``triconfig``).
"""
[docs] def list_stats(self):
"""
List of available statistics. Returns a list of tuples giving, for each
available statistic, its code and a short description.
"""
return [(k, v[0]) for (k, v) in self._stats.items()]
def _get_genosite_from_store(self):
if len(self._genosite_store): return self._genosite_store.pop()
else: return _eggwrapper.Genotypes()
def __init__(self, *args, **kwargs):
self._sd = _eggwrapper.SiteDiversity()
self._sd2 = _eggwrapper.SiteDiversity()
self._as = _eggwrapper.AlleleStatus()
self._d1 = _eggwrapper.Diversity1()
self._d2 = _eggwrapper.Diversity2()
self._h = _eggwrapper.Haplotypes()
self._rd = _eggwrapper.Rd()
self._ld = _eggwrapper.MatrixLD()
self._cv = _eggwrapper.ComputeV()
self._frq = _eggwrapper.FreqHolder()
self._frq2 = _eggwrapper.FreqHolder()
self._site = _eggwrapper.SiteHolder()
self._genosite = _eggwrapper.Genotypes()
self._genosite_cache = []
self._genosite_store = []
self._struct = None # structure (C++) passed as argument to configure()
self._struct_auto = True
self._triconfig = _eggwrapper.Triconfigurations()
self._stats = {
# code # description # flag # toggler
'ns_site': ('Number of analyzed samples per site', 'sd', []),
'ns_site_o': ('Number of analyzed outgroup samples per site', 'sd', []),
'Aing': ('Number of alleles in ingroup', 'sd', []),
'Aotg': ('Number of alleles in outgroup', 'sd', []),
'Atot': ('Number of alleles in whole dataset', 'sd', []),
'As': ('Number of singleton alleles', 'sd', []),
'Asd': ('Number of singleton alleles (derived)', 'sd', []),
'R': ('Allelic richness', 'sd', []),
'thetaIAM': ('Theta estimator based on He & IAM model', 'sd', []),
'thetaSMM': ('Theta estimator based on He & SMM model', 'sd', []),
'He': ('Expected heterozygosity', 'sd', []),
'Ho': ('Observed heterozygosity', 'sd', []),
'Fis': ('Inbreeding coefficient', 'sd', []),
'maf': ('Relative frequency of the minority allele', 'sd', []),
'maf_pop': ('maf per population (minority allele overall)', 'sd', []),
'FstWC': ('Weir and Cockerham for haploid data', 'sd', [self._sd.toggle_fstats_haplo]),
'FistWC': ('Weir and Cockerham for diploid data', 'sd', [self._sd.toggle_fstats_diplo]),
'FisctWC': ('Weir and Cockerham for hierarchical structure', 'sd', [self._sd.toggle_fstats_hier]),
'Dj': ('Jost\'s D', 'sd', [self._sd.toggle_hstats]),
'Hst': ('Hudson\'s Hst', 'sd', [self._sd.toggle_hstats]),
'Gst': ('Nei\'s Gst', 'sd', [self._sd.toggle_hstats]),
'Gste': ('Hedrick\'s Gst\'', 'sd', [self._sd.toggle_hstats]),
'numSp': ('Number of population-specific alleles', 'as', []),
'numSpd': ('Number of population-specific derived alleles', 'as', []),
'numShA': ('Number of shared alleles', 'as', []),
'numShP': ('Number of shared segregating alleles', 'as', []),
'numFxA': ('Number of fixed alleles', 'as', []),
'numFxD': ('Number of fixed differences', 'as', []),
'numSp*': ('Sites with at least 1 pop-specific allele', 'as', []),
'numSpd*': ('Sites with at least 1 pop-specific derived allele', 'as', []),
'numShA*': ('Sites with at least 1 shared allele', 'as', []),
'numShP*': ('Sites with at least 1 shared segregating allele', 'as', []),
'numFxA*': ('Sites with at least 1 fixed allele', 'as', []),
'numFxD*': ('Sites with at least 1 fixed difference', 'as', []),
'triconfig': ('Site configurations with three populations', 'tf', []),
'lseff': ('Number of analysed sites', 'd1', []),
'nsmax': ('Maximal number of available samples per site', 'd1', []),
'S': ('Number of segregating sites', 'd1', []),
'Ss': ('Number of sites with only one singleton allele', 'd1', []),
'eta': ('Minimal number of mutations', 'd1', []),
'Pi': ('Nucleotide diversity', 'd1', []),
'sites': ('Index of polymorphic sites', 'd1', [self._d1.toggle_site_lists]),
'singl': ('Index of sites with at least one singleton allele', 'd1', [self._d1.toggle_site_lists]),
'nall': ('Number of ingroup alleles at polymorphic sites', 'd1', [self._d1.toggle_site_lists]),
'frq': ('Allelic frequencies at polymorphic sites', 'd1', [self._d1.toggle_site_lists]),
'frqp': ('Population allelic frequencies at polymorphic sites', 'd1', [self._d1.toggle_site_lists]),
'lseffo': ('Number of analysed orientable sites', 'd1', [self._d1.toggle_ori_site]),
'nsmaxo': ('Maximal number of available samples per orientable site', 'd1', [self._d1.toggle_ori_site]),
'sites_o': ('Index of orientable polymorphic sites', 'd1', [self._d1.toggle_site_lists, self._d1.toggle_ori_site]),
'singl_o': ('Index of sites with at least one singleton allele', 'd1', [self._d1.toggle_site_lists, self._d1.toggle_ori_site]),
'So': ('Number of segregating orientable sites', 'd1', [self._d1.toggle_ori_site]),
'Sso': ('Number of orientable sites with only one singleton allele', 'd1', [self._d1.toggle_ori_site]),
'nsingld': ('Number of derived singletons', 'd1', [self._d1.toggle_ori_site]),
'etao': ('Minimal number of mutations are orientable sites', 'd1', [self._d1.toggle_ori_site]),
'nM': ('Number of sites available for MFDM test', 'd1', [self._d1.toggle_ori_site]),
'pM': ('P-value of MDFM test', 'd1', [self._d1.toggle_ori_site]),
'nseffo': ('Average number of exploitable samples at orientable sites', 'd1', [self._d1.toggle_ori_div]),
'thetaPi': ('Pi using orientable sites', 'd1', [self._d1.toggle_ori_div]),
'thetaH': ('Fay and Wu\'s estimator of theta', 'd1', [self._d1.toggle_ori_div]),
'thetaL': ('Zeng et al.\'s estimator of theta', 'd1', [self._d1.toggle_ori_div]),
'Hns': ('Fay and Wu\'s H (unstandardized)', 'd1', [self._d1.toggle_ori_div]),
'Hsd': ('Fay and Wu\'s H (standardized)', 'd1', [self._d1.toggle_ori_div]),
'E': ('Zeng et al.\'s E', 'd1', [self._d1.toggle_ori_div]),
'Dfl': ('Fu and Li\'s D', 'd1', [self._d1.toggle_ori_div]),
'F': ('Fu and Li\'s F', 'd1', [self._d1.toggle_ori_div]),
'nseff': ('Average number of exploitable samples', 'd1', [self._d1.toggle_basic]),
'thetaW': ('Watterson\'s estimator of theta', 'd1', [self._d1.toggle_basic]),
'Dxy': ('Pairwise distance (if two populations)', 'd1', [self._d1.toggle_basic]),
'Da': ('Net pairwise distance (if two populations)', 'd1', [self._d1.toggle_basic]),
'D': ('Tajima\'s D', 'd1', [self._d1.toggle_basic]),
'Deta': ('Tajima\'s D using eta instead of S', 'd1', [self._d1.toggle_basic]),
'D*': ('Fu and Li\'s D*', 'd1', [self._d1.toggle_basic]),
'F*': ('Fu and Li\'s F*', 'd1', [self._d1.toggle_basic]),
'R2': ('Ramos-Onsins and Rozas\'s R2 (using singletons)', 'd2', [self._d2.toggle_singletons]),
'R3': ('Ramos-Onsins and Rozas\'s R3 (using singletons)', 'd2', [self._d2.toggle_singletons]),
'R4': ('Ramos-Onsins and Rozas\'s R4 (using singletons)', 'd2', [self._d2.toggle_singletons]),
'Ch': ('Ramos-Onsins and Rozas\'s Ch (using singletons)', 'd2', [self._d2.toggle_singletons]),
'R2E': ('Ramos-Onsins and Rozas\'s R2E (using external singletons)', 'd2', [self._d2.toggle_singletons]),
'R3E': ('Ramos-Onsins and Rozas\'s R3E (using external singletons)', 'd2', [self._d2.toggle_singletons]),
'R4E': ('Ramos-Onsins and Rozas\'s R4E (using external singletons)', 'd2', [self._d2.toggle_singletons]),
'ChE': ('Ramos-Onsins and Rozas\'s ChE (using external singletons)', 'd2', [self._d2.toggle_singletons]),
'B': ('Wall\'s B statistic', 'd2', [self._d2.toggle_partitions]),
'Q': ('Wall\'s Q statistic', 'd2', [self._d2.toggle_partitions]),
'Kt': ('Number of haplotypes', 'h', []),
'Ki': ('Number of haplotypes (only ingroup)', 'h', []),
'FstH': ('Hudson\'s Fst', 'h', [self._toggle_haplotype_stats]),
'Kst': ('Hudson\'s Kst', 'h', [self._toggle_haplotype_stats]),
'Snn': ('Hudson\'s nearest nearest neighbour statistic', 'h', [self._toggle_haplotype_stats]),
'rD': ('R_bar{d} statistic', 'rD', []),
'Rmin': ('Minimal number of recombination events', 'LD', [self._ld.toggle_Rmin]),
'RminL': ('Number of sites used to compute Rmin', 'LD', [self._ld.toggle_Rmin]),
'Rintervals': ('List of start/end positions of recombination intervals', 'LD', [self._ld.toggle_Rmin]),
'nPairs': ('Number of allele pairs used for ZnS, Z*nS, and Z*nS*', 'LD', [self._ld.toggle_stats]),
'nPairsAdj': ('Allele pairs at adjacent sites (used for ZZ and Za)', 'LD', [self._ld.toggle_stats]),
'ZnS': ('Kelly et al.\'s ZnS', 'LD', [self._ld.toggle_stats]),
'Z*nS': ('Kelly et al.\'s Z*nS', 'LD', [self._ld.toggle_stats]),
'Z*nS*': ('Kelly et al.\'s Z*nS*', 'LD', [self._ld.toggle_stats]),
'Za': ('Rozas et al.\'s Za', 'LD', [self._ld.toggle_stats]),
'ZZ': ('Rozas et al.\'s ZZ', 'LD', [self._ld.toggle_stats]),
'Fs': ('Fu\'s Fs', 'Fs', []),
'V': ('Allele size variance', 'V', []),
'Ar': ('Allele range', 'V', []),
'M': ('Garza and Williamson (2001)\'s M = Aing / (Ar+1)', 'V', []),
'Rst': ('Slatkin (1995)\'s Rst', 'V', [])
}
self.clear_stats()
self.configure(*args, **kwargs)
[docs] def reset(self):
"""
Reset all currently computed statistics. Keep the list of
statistics to compute.
"""
self._sd.reset()
self._sd2.reset()
self._as.reset()
self._d1.reset_stats()
self._d2.reset_stats()
self._h.reset_stats()
self._rd.reset_stats()
self._ld.reset()
self._cv.reset()
self._triconfig.reset()
self._site_index = 0
self._static_sites = True # set to False if sites cannot be considered to be static (required for LD stats)
self._genosite_store.extend(self._genosite_cache)
del self._genosite_cache[:]
if self._struct_auto: self._struct = None
[docs] def set_structure(self, struct):
"""
Set the structure object. Reset statistics but don't change the
list of statistics to compute and don't change over parameter
values.
"""
self.reset()
if struct is None:
self._struct_auto = True
self._struct = None
self._aux = None
else:
self._struct_auto = False
self._struct = struct._obj
self._aux = struct.make_auxiliary()._obj
self._aux_sorted = struct.make_sorted_auxiliary()._obj
self._h.setup(struct._obj)
self._rd.configure(struct._obj)
self._d2.set_structure(self._aux)
self._ld.set_structure(self._aux)
def _check_struct(self, ns):
if self._struct_auto or self._struct is None:
struct = _eggwrapper.StructureHolder()
struct.mk_dummy_structure(ns, 1)
if self._struct is None: self._h.setup(struct)
else: self._h.set_structure(struct) # does not reset stats
self._d2.set_structure(struct)
self._rd.configure(struct)
self._ld.set_structure(struct) # no difference between struct and aux
self._struct = struct
self._aux = struct # identical because no actual structure
self._aux_sorted = struct # idem
else:
if self._struct.get_req() > ns: raise ValueError('structure does not match number of samples')
[docs] def add_stats(self, *stats):
"""
add_stats(stat, ...)
Add one or more statistics to compute. Every statistic identifier must be among
the list of available statistics, regardless of what data is to
be analyzed. If statistics cannot be computed, they will be
returned as ``None``.
"""
for stat in stats:
if stat not in self._stats:
raise ValueError('invalid statistic: {0}'.format(stat))
self._wanted_stats.add(stat)
self._flags[self._stats[stat][1]] = True;
for f in self._stats[stat][2]: f()
if self._flags['Fs']:
self._flags['d1'] = True
self._flags['h'] = True
if (self._flags['as'] or self._flags['d1'] or self._flags['d2']
or self._flags['rD'] or self._flags['LD']
or self._flags['h']):
self._flags['sd'] = True
[docs] def all_stats(self):
"""
Add all possible statistics. Those who cannot be computed will
be reported as ``None``. Also reset all currently computed statistics (if any).
"""
self.add_stats(*self._stats)
[docs] def clear_stats(self):
"""
Clear the list of statistics to compute.
"""
self._wanted_stats = set()
self._flags = dict.fromkeys([self._stats[k][1] for k in self._stats], False)
self._sd.toggle_off()
self._sd2.toggle_off()
self._d1.toggle_off()
self._ld.toggle_off()
self._toggle_off()
def _toggle_off(self):
self._flag_haplotype_stats = False
def _toggle_haplotype_stats(self):
self._flag_haplotype_stats = True
[docs] def process_freq(self, frq, position=None):
"""
Analyze already computed frequencies.
:param frq: a :class:`.Freq` instance.
:param position: position of the site. Must be an integer value.
By default, use the site loading index.
:return: A dictionary of statistics, unless *multi* is set.
"""
if not self._multi:
stats = dict.fromkeys(self._wanted_stats, None)
if position is None: position = self._site_index
if self._flags['tf'] == True:
self._triconfig.process(frq._obj)
if self._flags['sd'] == True:
res = self._sd.process(frq._obj)
if not self._multi: self._get_sd_stats(res, stats)
if self._flags['as'] == True and (res & 1024) != 0 and self._sd.npop_eff1() > 1:
self._as.process(frq._obj)
if not self._multi: self._get_as_stats(stats)
if self._flags['d1'] == True and (res & 2) != 0:
self._d1.load(frq._obj, self._sd, position)
if self._flags['V'] == True:
if frq._alphabet.type not in ['int', 'range']: raise ValueError('cannot compute V, Ar, M, and Rst with this alphabet')
b = self._cv.compute(frq._obj, frq._alphabet._obj)
if not self._multi and b:
if 'V' in stats: stats['V'] = self._cv.curr_V()
if 'Ar' in stats: stats['Ar'] = self._cv.curr_Ar()
if 'M' in stats and self._cv.curr_M() != _eggwrapper.UNDEF: stats['M'] = self._cv.curr_M()
if 'Rst' in stats and self._cv.curr_Rst() != _eggwrapper.UNDEF: stats['Rst'] = self._cv.curr_Rst()
self._site_index += 1
if not self._multi: return stats
[docs] def process_site(self, site, position=None):
"""
Analyze a site.
:param site: a :class:`.Site` instance.
:param position: position of the site. Must be an integer value.
By default, use the site index.
:return: A dictionary of statistics, unless *multi* is set.
"""
self._check_struct(site._obj.get_ns())
if position is None: position = self._site_index
if not self._multi: stats = dict.fromkeys(self._wanted_stats, None)
if self._flags['d2'] or self._flags['h']: # add LD for process_sites/process_align
self._genosite.process(site._obj, self._struct)
if self._flags['d2']:
self._frq2.setup_structure(self._aux)
self._frq2.process_site(self._genosite.site())
self._sd2.process(self._frq2)
self._frq.setup_structure(self._struct)
self._frq.process_site(site._obj)
if self._flags['tf'] == True:
self._triconfig.process(self._frq)
if self._flags['sd'] == True:
res = self._sd.process(self._frq)
if not self._multi: self._get_sd_stats(res, stats)
if self._flags['as'] == True and (res & 1024) != 0 and self._sd.npop_eff1() > 1:
self._as.process(self._frq)
if not self._multi: self._get_as_stats(stats)
if self._flags['d1'] == True and (res & 2) != 0:
self._d1.load(self._frq, self._sd, position)
if self._flags['d2'] == True and (res & 2) != 0:
self._d2.load(self._genosite.site(), self._sd2, self._frq2)
if self._flags['h'] == True and self._sd.Aglob() > 1 and (self._sd.Aing() == 2 or self._multi_hits == True):
self._h.load(self._genosite)
if self._flags['rD'] == True:
self._rd.load(site._obj)
if self._flags['V'] == True:
if site._alphabet.type not in ['int', 'range']: raise ValueError('cannot compute V, Ar, M, and Rst with this alphabet')
b = self._cv.compute(self._frq, site._alphabet._obj)
if not self._multi and b:
if 'V' in stats: stats['V'] = self._cv.curr_V()
if 'Ar' in stats: stats['Ar'] = self._cv.curr_Ar()
if 'M' in stats and self._cv.curr_M() != _eggwrapper.UNDEF: stats['M'] = self._cv.curr_M()
if 'Rst' in stats and self._cv.curr_Rst() != _eggwrapper.UNDEF: stats['Rst'] = self._cv.curr_Rst()
self._site_index += 1
if self._multi: self._static_sites = False
else: return stats
[docs] def process_sites(self, sites, positions=None):
"""
Analyze a list of sites.
:param sites: a list, or other sequence of :class:`.Site`
instance. (:class:`.VcfWindow` instances
with a number of sites greater than 0 are also supported.)
:param positions: a list, or other sequence of positions for all
sites, or ``None``. If ``None``, use the index of each site. Otherwise,
must be a sequences of integer values (length matching the
length of *sites*).
:return: A dictionary of statistics, unless *multi* is set.
"""
if positions is None:
positions = (self._site_index + i for i in range(len(sites)))
elif len(positions) != len(sites):
raise ValueError('number of positions must match the number of sites')
if len(sites) > 0:
for site, pos in zip(sites, positions):
self._check_struct(site._obj.get_ns())
self._frq.setup_structure(self._struct)
self._frq.process_site(site._obj)
keep = False
if self._flags['d2'] or self._flags['h'] or self._flags['LD']:
self._genosite.process(site._obj, self._struct)
if self._flags['d2']:
self._frq2.setup_structure(self._aux)
self._frq2.process_site(self._genosite.site())
self._sd2.process(self._frq2)
if self._flags['tf'] == True:
self._triconfig.process(self._frq)
if self._flags['sd'] == True: res = self._sd.process(self._frq) # don't require consistency
if self._flags['as'] == True and (res & 1024) != 0 and self._sd.npop_eff1() > 1: self._as.process(self._frq) # don't require consistency
if self._flags['d1'] == True and (res & 2) != 0: self._d1.load(self._frq, self._sd, pos) # don't require consistency
if self._flags['d2'] == True and (res & 2) != 0:
self._d2.load(self._genosite.site(), self._sd2, self._frq2)
if self._flags['h'] == True and self._sd.Aglob() > 1 and (self._multi_hits == True or self._sd.Aing() < 3):
self._h.load(self._genosite)
keep |= True
if self._flags['rD'] == True and (res & 2) != 0 and self._sd.Aing() > 1: self._rd.load(site._obj)
if self._flags['LD'] == True and (res & 2) != 0 and (self._sd.Aing() == 2 or (self._sd.Aing() > 2 and self._LD_multiallelic != _eggwrapper.MatrixLD.ignore)):
keep |= True
self._ld.load(self._genosite, pos)
if self._flags['V'] == True:
if site._alphabet.type not in ['int', 'range']: raise ValueError('cannot compute V, Ar, M, and Rst with this alphabet')
self._cv.compute(self._frq, site._alphabet._obj)
# if need to keep site, put it in cache and claim new
if keep:
self._genosite_cache.append(self._genosite)
self._genosite = self._get_genosite_from_store()
self._site_index += len(sites)
if self._multi: self._static_sites = False # prevent attempting using sites (which may be invalidated)
else: return self.results()
[docs] def process_align(self, align, positions=None, max_missing=0.0):
"""
Analyze an alignment.
:param align: an :class:`.Align` instance.
:param positions: a list, or other sequence of positions for all
sites, or ``None``. If ``None``, use the index of each site. Otherwise,
must be a sequences of integer values (length matching the
number of sites).
:param max_missing: Maximum relative proportion of missing data (excluding
outgroup). The default is to exclude all sites with missing data. Missing
data include all ambiguity characters and alignment gaps.
Sites not passing this threshold are ignored for all
analyses.
.. note::
Here, *max_missing* is a relative proportion to allow
using the same value for different alignments that might
not have the same number of samples (to avoid reevaluating
*max_missing* if the user wants the same maximum rate
of missing data). In other functions, *max_missing* is
the maximum *number* of missing and is an integer.
:return: A dictionary of statistics, unless *multi* is set.
"""
if not isinstance(align, _interface.Align): raise TypeError('expect an Align instance')
# check max_missing
max_missing = float(max_missing)
if max_missing < 0.0 or max_missing > 1.0:
raise ValueError('invalid value for `max_missing` argument: {0}'.format(max_missing))
# get number of sites and positions
ls = align._obj.get_nsit_all()
if positions is None:
positions = (self._site_index + i for i in range(ls))
elif len(positions) != ls:
raise ValueError('number of positions must match the number of sites')
self._check_struct(align._obj.get_nsam())
ns = self._struct.get_ni()
no = self._struct.get_no()
# get max_missing number
min_valid = math.floor(ns - max_missing * ns)
# process sites
for idx, pos in enumerate(positions):
self._site.reset()
if self._site.process_align(align._obj, idx, self._struct) < min_valid:
continue
if self._flags['d2'] or self._flags['h'] or self._flags['LD']:
self._genosite.process(self._site, self._aux_sorted)
if self._flags['d2']:
self._frq2.setup_structure(self._aux)
self._frq2.process_site(self._genosite.site())
self._sd2.process(self._frq2)
self._frq.setup_structure(self._aux_sorted)
self._frq.process_site(self._site)
keep = False
if self._flags['tf'] == True: self._triconfig.process(self._frq)
if self._flags['sd'] == True: res = self._sd.process(self._frq)
if self._flags['as'] == True and (res & 1024) != 0 and self._sd.npop_eff1() > 1: self._as.process(self._frq)
if self._flags['d1'] == True and (res & 2) != 0: self._d1.load(self._frq, self._sd, pos)
if self._flags['d2'] == True and (res & 2) != 0:
self._d2.load(self._genosite.site(), self._sd2, self._frq2)
if self._flags['h'] == True and self._sd.Aglob() > 1 and (self._multi_hits == True or self._sd.Aing() < 3):
self._h.load(self._genosite)
keep |= True
if self._flags['rD'] == True and (res & 2) != 0 and self._sd.Aing() > 1:
self._rd.load(self._site) # Rd does not assume that data are phased
if self._flags['LD'] == True and (res & 2) != 0 and (self._sd.Aing() == 2 or (self._sd.Aing() > 2 and self._LD_multiallelic != _eggwrapper.MatrixLD.ignore)):
self._ld.load(self._genosite, pos)
keep |= True
if self._flags['V'] == True:
if align._alphabet.type not in ['int', 'range']: raise ValueError('cannot compute V, Ar, M, and Rst with this alphabet')
self._cv.compute(self._frq, align._alphabet._obj)
# if need to keep site, put it in cache and claim new
if keep:
self._genosite_cache.append(self._genosite)
self._genosite = self._get_genosite_from_store()
self._site_index += ls
if not self._multi: return self.results()
[docs] def results(self):
"""
Get computed statistics.
Return the value of statistics for all sites since the last call
to this method, to :meth:`reset`, or any addition of statistics,
or the object creation,
whichever is most recent. For statistics that can not be
computed, ``None`` is returned.
"""
stats = dict.fromkeys(self._wanted_stats, None)
if self._flags['tf'] == True:
stats['triconfig'] = [self._triconfig.cnt(i) for i in range(13)]
if self._flags['sd'] == True:
res = self._sd.average()
self._get_sd_stats(res, stats)
if self._flags['as'] == True:
self._as.total()
if (res & 1024) != 0 and self._sd.npop_eff1() > 1:
self._get_as_stats(stats)
if self._as.nsites() > 0:
if 'numSp*' in stats: stats['numSp*'] = self._as.Sp_T1()
if 'numSpd*' in stats and self._as.nsites_o() > 0: stats['numSpd*'] = self._as.Spd_T1()
if 'numShP*' in stats: stats['numShP*'] = self._as.ShP_T1()
if 'numShA*' in stats: stats['numShA*'] = self._as.ShA_T1()
if 'numFxD*' in stats: stats['numFxD*'] = self._as.FxD_T1()
if 'numFxA*' in stats: stats['numFxA*'] = self._as.FxA_T1()
if self._flags['d1'] == True:
res = self._d1.compute()
if 'lseff' in stats: stats['lseff'] = self._d1.ls()
if 'lseffo' in stats: stats['lseffo'] = self._d1.lso()
if (res & 1) != 0:
if 'nsmax' in stats: stats['nsmax'] = self._d1.nsmax()
if 'S' in stats: stats['S'] = self._d1.S()
if 'Ss' in stats: stats['Ss'] = self._d1.Ss()
if 'sites' in stats: stats['sites'] = [self._d1.site(i) for i in range(self._d1.S())]
if 'singl' in stats: stats['singl'] = [self._d1.singl(i) for i in range(self._d1.Ss())]
if 'nall' in stats: stats['nall'] = [self._d1.nall(i) for i in range(self._d1.S())]
if 'frq' in stats: stats['frq'] = [[self._d1.frq(i, j) for j in range(self._d1.nall(i))] for i in range(self._d1.S())]
if 'frqp' in stats: stats['frqp'] = [[[self._d1.frqp(i, j, k) for k in range(self._struct.num_pop())] for j in range(self._d1.nall(i))] for i in range(self._d1.S())]
if 'eta' in stats: stats['eta'] = self._d1.eta()
if 'Pi' in stats: stats['Pi'] = self._d1.Pi()
if (res & 8) != 0:
if 'nsmaxo' in stats: stats['nsmaxo'] = self._d1.nsmaxo()
if 'So' in stats: stats['So'] = self._d1.So()
if 'Sso' in stats: stats['Sso'] = self._d1.Sso()
if 'nsingld' in stats: stats['nsingld'] = self._d1.nsingld()
if 'sites_o' in stats: stats['sites_o'] = [self._d1.site_o(i) for i in range(self._d1.So())]
if 'singl_o' in stats: stats['singl_o'] = [self._d1.singl_o(i) for i in range(self._d1.Sso())]
if 'etao' in stats: stats['etao'] = self._d1.etao()
if 'nM' in stats: stats['nM'] = self._d1.nM()
if (res & 16) != 0:
if 'pM' in stats: stats['pM'] = self._d1.pM()
if (res & 32) != 0:
if 'nseffo' in stats: stats['nseffo'] = self._d1.nseffo()
if 'thetaPi' in stats: stats['thetaPi'] = self._d1.thetaPi()
if 'thetaH' in stats: stats['thetaH'] = self._d1.thetaH()
if 'thetaL' in stats: stats['thetaL'] = self._d1.thetaL()
if 'Hns' in stats: stats['Hns'] = self._d1.Hns()
if (res & 1024) != 0:
if 'Hsd' in stats: stats['Hsd'] = self._d1.Hsd()
if (res & 2048) != 0:
if 'E' in stats: stats['E'] = self._d1.E()
if (res & 4096) != 0:
if 'Dfl' in stats: stats['Dfl'] = self._d1.Dfl()
if (res & 8192) != 0:
if 'F' in stats: stats['F'] = self._d1.F()
if (res & 128) != 0:
if 'nseff' in stats: stats['nseff'] = self._d1.nseff()
if 'thetaW' in stats: stats['thetaW'] = self._d1.thetaW()
if (res & 16384) != 0:
if 'Dxy' in stats: stats['Dxy'] = self._d1.Dxy()
if 'Da' in stats: stats['Da'] = self._d1.Da()
if (res & 256) != 0:
if 'F*' in stats: stats['F*'] = self._d1.Fstar()
if (res & 512) != 0:
if 'D' in stats: stats['D'] = self._d1.D()
if 'Deta' in stats: stats['Deta'] = self._d1.Deta()
if 'D*' in stats: stats['D*'] = self._d1.Dstar()
if self._flags['d2'] == True:
res = self._d2.compute()
if (res & 1) == 0:
if (res & 256) != 0:
if 'R2' in stats: stats['R2'] = self._d2.R2()
if 'R3' in stats: stats['R3'] = self._d2.R3()
if 'R4' in stats: stats['R4'] = self._d2.R4()
if 'Ch' in stats: stats['Ch'] = self._d2.Ch()
if (res & 512) != 0:
if 'R2E' in stats: stats['R2E'] = self._d2.R2E()
if 'R3E' in stats: stats['R3E'] = self._d2.R3E()
if 'R4E' in stats: stats['R4E'] = self._d2.R4E()
if 'ChE' in stats: stats['ChE'] = self._d2.ChE()
if (res & 1024) != 0:
if 'B' in stats: stats['B'] = self._d2.B()
if 'Q' in stats: stats['Q'] = self._d2.Q()
if self._flags['h'] == True and self._h.n_sites() > 0 and not self._h.invalid():
self._h.cp_haplotypes()
if 'Kt' in stats and (self._h.ne_ing() > 0 or self._h.ne_otg() > 0): stats['Kt'] = self._h.ng_hapl()
if self._h.ne_ing() > 0 and 'Ki' in stats: stats['Ki'] = self._h.ni_hapl()
if self._flag_haplotype_stats:
self._h.cp_dist()
res = self._h.cp_stats()
if 'FstH' in stats and res & 1 != 0: stats['FstH'] = self._h.Fst()
if 'Kst' in stats and res & 2 != 0: stats['Kst'] = self._h.Kst()
if 'Snn' in stats:
stats['Snn'] = self._h.Snn()
if stats['Snn'] == _eggwrapper.UNDEF: stats['Snn'] = None
if 'Fs' in stats and self._h.n_sites() and self._h.ni_hapl() > 0:
stats['Fs'] = _eggwrapper.Fs(self._h.ne_ing(), self._h.ni_hapl(), self._d1.Pi())
if self._static_sites == True and 'rD' in stats:
stats['rD'] = self._rd.compute()
if stats['rD'] == _eggwrapper.UNDEF: stats['rD'] = None
if self._static_sites == True and self._flags['LD'] == True:
flag = self._ld.process(self._LD_min_n, self._LD_max_maj,
self._LD_multiallelic, self._LD_min_freq,
self._Rmin_oriented)
if 'nPairs' in stats: stats['nPairs'] = self._ld.num_allele_pairs()
if 'nPairsAdj' in stats: stats['nPairsAdj'] = self._ld.num_allele_pairs_adj()
if 'RminL' in stats: stats['RminL'] = self._ld.Rmin_num_sites()
if (flag&1) != 0:
if 'ZnS' in stats: stats['ZnS'] = self._ld.ZnS()
if 'Z*nS' in stats: stats['Z*nS'] = self._ld.ZnS_star1()
if 'Z*nS*' in stats: stats['Z*nS*'] = self._ld.ZnS_star2()
if (flag&2) != 0:
if 'Za' in stats: stats['Za'] = self._ld.Za()
if 'ZZ' in stats: stats['ZZ'] = self._ld.ZZ()
if (flag&4) != 0:
if 'Rmin' in stats: stats['Rmin'] = self._ld.Rmin()
if 'Rintervals' in stats: stats['Rintervals'] = [(self._ld.Rmin_left(i), self._ld.Rmin_right(i)) for i in range(self._ld.Rmin())]
if self._flags['V']:
if self._cv.num_sites() > 0:
if 'V' in stats: stats['V'] = self._cv.average_V()
if 'Ar' in stats: stats['Ar'] = self._cv.average_Ar()
if 'M' in stats and self._cv.num_sites_m() > 0: stats['M'] = self._cv.average_M()
if 'Rst' in stats and self._cv.average_Rst() != _eggwrapper.UNDEF: stats['Rst'] = self._cv.average_Rst()
self.reset()
return stats
def _get_sd_stats(self, flag, stats):
if (flag & 1) != 0 and 'ns_site' in stats:
if (flag & 512) != 0: stats['ns_site'] = int(self._sd.ns())
else: stats['ns_site'] = self._sd.ns()
if (flag & 1) != 0 and 'ns_site_o' in stats:
if (flag & 512) != 0: stats['ns_site_o'] = int(self._sd.nso())
else: stats['ns_site_o'] = self._sd.nso()
if (flag & 2) != 0:
if 'Aing' in stats:
if (flag & 512) != 0: stats['Aing'] = int(self._sd.Aing())
else: stats['Aing'] = self._sd.Aing()
if 'Atot' in stats:
if (flag & 512) != 0: stats['Atot'] = int(self._sd.Aglob())
else: stats['Atot'] = self._sd.Aglob()
if 'As' in stats:
if (flag & 512) != 0: stats['As'] = int(self._sd.S())
else: stats['As'] = self._sd.S()
if 'R' in stats: stats['R'] = self._sd.R()
if 'He' in stats: stats['He'] = self._sd.He()
if (flag & 4) != 0:
if 'thetaIAM' in stats: stats['thetaIAM'] = self._sd.thetaIAM()
if 'thetaSMM' in stats: stats['thetaSMM'] = self._sd.thetaSMM()
if (flag & 8) != 0:
if 'Ho' in stats: stats['Ho'] = self._sd.Ho()
if 'Fis' in stats and self._sd.He() > 0.0:
stats['Fis'] = 1 - self._sd.Ho() / self._sd.He()
if (flag & 16) != 0:
if 'Asd' in stats:
if (flag & 512) != 0: stats['Asd'] = int(self._sd.Sd())
else: stats['Asd'] = self._sd.Sd()
if (flag & 16384) != 0:
if 'Aotg' in stats:
if (flag & 512) != 0: stats['Aotg'] = int(self._sd.Aout())
else: stats['Aotg'] = self._sd.Aout()
if (flag & 32) != 0 and 'FstWC' in stats:
n = self._sd.n()
d = self._sd.d()
stats['FstWC'] = n/d if d > 0.0 else None
if (flag & 64) != 0 and 'FistWC' in stats and self._struct.get_ploidy() > 1:
a = self._sd.a()
b = self._sd.b()
c = self._sd.c()
stats['FistWC'] = ( 1.0 - c/(b+c) if (b+c) > 0.0 else None,
a/(a+b+c), 1.0 - c/(a+b+c)) if (a+b+c) > 0.0 else None
if (flag & 128) != 0 and 'FisctWC' in stats and self._struct.get_ploidy() > 1:
a0 = self._sd.a0()
b2 = self._sd.b2()
b1 = self._sd.b1()
c0 = self._sd.c0()
stats['FisctWC'] = ( 1.0 - c0/(b1+c0) if (b1+c0) > 0.0 else None,
(a0+b2) / (a0+b2+b1+c0), a0/(a0+b2+b1+c0),
1.0 - c0/(a0+b2+b1+c0)) if (a0+b2+b1+c0) > 0.0 else None
if (flag & 256) != 0:
if 'Dj' in stats: stats['Dj'] = self._sd.D()
if 'Hst' in stats and (flag & 2048) != 0: stats['Hst'] = self._sd.Hst()
if 'Gst' in stats and (flag & 4096) != 0: stats['Gst'] = self._sd.Gst()
if 'Gste' in stats and (flag & 8192) != 0: stats['Gste'] = self._sd.Gste()
if (flag & 1024) != 0:
if 'maf' in stats: stats['maf'] = self._sd.maf()
if 'maf_pop' in stats: stats['maf_pop'] = [self._sd.maf_pop(i) for i in range(self._sd.num_pop())]
def _get_as_stats(self, stats):
if self._as.nsites() > 0:
if 'numSp' in stats: stats['numSp'] = self._as.Sp()
if 'numSpd' in stats and self._as.nsites_o() > 0: stats['numSpd'] = self._as.Spd()
if 'numShP' in stats: stats['numShP'] = self._as.ShP()
if 'numShA' in stats: stats['numShA'] = self._as.ShA()
if 'numFxD' in stats: stats['numFxD'] = self._as.FxD()
if 'numFxA' in stats: stats['numFxA'] = self._as.FxA()