Source code for egglib.stats._sfs

"""
    Copyright 2025 Mathieu Siol, Stéphane De Mita

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

from .. import _freq
from .. import _site

[docs] def SFS(sites, struct=None, max_missing=0.0, mode='auto', nbins=None, skip_fixed=False): """ Compute the site frequency spectrum. This function processes an list of sites (or any iterator yielding :class:`.Site` instances) and computes the distribution of frequencies of either the minority or the derived allele of diallelic sites. :param sites: an iterator over :class:`.Site` instances. Unless *nbins* is specified, all sites are required to have the same number of samples. :param struct: a :class:`.Structure` instance defining the samples to process and/or the outgroup samples (necessary for the unfolded spectrum). :param max_missing: maximum relative proportion of missing data in the ingroup to process a site. Warning: when using this option without binarization (absolute frequencies), the results might be unreliable as sites with missing data have less samples. :param mode: indicate whether the SFS should be folded or unfolded (possible only when outgroup is present in struct). ``auto``: as implied by the outgroup; ``folded``: folded (even if there is an outgroup); ``unfolded``: unfolded (causing an error if no outgroup is present). :params nbins: number of bins on which to calculate the SFS. If ``None``, the full SFS is returned as a list. :params skip_fixed: boolean indicating whether fixed sites should be counted of not. If set to ``True`` with a full, folded SFS (``nbins=None`` with no outgroup used), the first and last count values are set to ``None`` but kept in the returned list. :return: If bins are specified: a list of ``(bound, count)`` tuples where *bound* is the top limit of each bin, which is inclusive, and *count* is the number of sites falling in this category. Otherwise (``nbins=None``): a list containing the absolute SFS, giving the number of sites corresponding to all possible frequencies ranging from 0 to ``n/2+1`` (if folded) or ``n-1`` (unfolded), inclusive. .. versionadded:: 3.4 """ if max_missing < 0 or max_missing > 1: raise ValueError('max_missing out of bound') max_missing += 1e-12 # guard against rounding errors # create an internal variable to determine if folded or not (once and for all) if mode == "auto": if struct is None: folded = True else: if struct.num_indiv_outgroup != 0: folded = False else: folded = True elif mode == 'folded': folded = True elif mode == 'unfolded': if struct is None or struct.num_indiv_outgroup == 0: raise ValueError('cannot process unfolded SFS with no outgroup information') folded = False else: raise ValueError('invalid value for option mode') _ns = None freq = _freq.Freq() # absolute SFS if nbins is None: for site in sites: if not isinstance(site, _site.Site): raise TypeError('expect a Site instance') if _ns is None: _ns = site.ns nseff = _ns if struct is None else struct.ns if folded == False: counts = [0] * (nseff+1) elif folded == True: counts = [0] * (nseff//2+1) else: if site.ns != _ns: raise ValueError('inconsistent number of samples between sites') freq.from_site(site, struct) if freq.num_alleles > 2: continue if freq.nseff() == 0: continue if 1-freq.nseff()/nseff > max_missing: continue if freq.num_alleles == 1: if not skip_fixed: counts[0] += 1 elif folded: p = min(freq.freq_allele(0, cpt = _freq.Freq.ingroup), freq.freq_allele(1, cpt = _freq.Freq.ingroup)) if not skip_fixed or p > 0: counts[p] += 1 else: # get derived allele if outgroup present l = (freq.freq_allele(0, cpt = _freq.Freq.outgroup), freq.freq_allele(1, cpt = _freq.Freq.outgroup)) if 0 not in l: # polymorphic in outgroup (cannot orientate site) continue derived = 1 - l.index(max(l)) p = freq.freq_allele(derived, cpt = _freq.Freq.ingroup) if not skip_fixed or (p > 0 and p < nseff): counts[p] += 1 if skip_fixed: if counts[0] != 0: raise RuntimeError('zero-frequency category unexceptedly non-empty') counts[0] = None if not folded: if counts[-1] != 0: raise RuntimeError('last frequency category unexceptedly non-empty') counts[-1] = None return counts # binarized SFS else: if nbins < 2: raise ValueError('invalid number of bins') if folded: step = 0.5/nbins else: step = 1/nbins bounds = [i*step for i in range(1, nbins+1)] counts = [0 for i in range(len(bounds))] for site in sites: if not isinstance(site, _site.Site): raise TypeError('expect a Site instance') nseff = site.ns if struct is None else struct.ns freq.from_site(site, struct) if freq.nseff() == 0: continue if freq.num_alleles > 2: continue if skip_fixed and freq._obj.frq_ingroup().num_alleles_eff() == 1: continue if 1-freq.nseff()/nseff > max_missing: continue if folded: if freq.num_alleles == 1: counts[0] += 1 else: minor = min(freq.freq_allele(0, cpt = _freq.Freq.ingroup), freq.freq_allele(1, cpt = _freq.Freq.ingroup))/freq.nseff() if minor > 1e-12 or not skip_fixed: c = 0 while True: if minor > bounds[c]: c += 1 else: break counts[c] += 1 else: if freq.num_alleles == 1: counts[0] += 1 else: lst = (freq.freq_allele(0, cpt = _freq.Freq.outgroup), freq.freq_allele(1, cpt = _freq.Freq.outgroup)) if 0 not in lst: # polymorphic in outgroup (cannot orientate site) continue derived = 1 - lst.index(max(lst)) rel = freq.freq_allele(derived, cpt = _freq.Freq.ingroup)/freq.nseff() if not skip_fixed or not (rel < 1e-12 or rel > (1-1e-12)): c = 0 while True: if rel > bounds[c]: c += 1 else: break counts[c] += 1 return list(zip(bounds, counts))