Source code for egglib.stats._haplotypes

"""
    Copyright 2015-2021 Stephane De Mita, Mathieu Siol

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

from .. import eggwrapper as _eggwrapper
from .. import _interface, alphabets
from .. import _site

alphabet_genotypes = alphabets.Alphabet('range', (0, None), (-1, 0),
    case_insensitive=False, name='genotypes')

[docs]def haplotypes_from_align(aln, max_missing=0.0, impute_threshold=0, struct=None, dest=None, multiple=False): """ Identification of haplotypes from an alignment. Identify haplotypes an :class:`.Align` instance and return data as a single :class:`.Site` instance containing one sample for each sample of the original data. Alleles in the returned site are representing all identified haplotypes (or missing data when the haplotypes could not be derived). .. note:: There must be at least one site with at least two alleles (overall, including the outgroup), otherwise the produced site only contains missing data. :param aln: an :class:`.Align` instance. :param impute_threshold: by default, all samples with a least one occurrence of missing data will be treated as missing data. If this argument is more than 0, the provided value will be used as maximum number of missing data. All samples with as many or less missing data will be processed to determine which extant of haplotype they might belong (to which they are identical save for missing data). If there is only one such haplotype, the corresponding samples will be treated as a repetition of this haplotype. This option will never allow detecting new haplotypes. Only small values of this option make sense. :param struct: a :class:`.Structure` instance defining the samples to process. The population and cluster structures are not used. If the ploidy is larger than 1, the individuals are used, and sites are assumed to be phased. :param max_missing: maximum relative proportion of missing data in the ingroup to process a site. .. note:: Here, *max_missing* is a relative proportion to allow using the same value for different alignments that might not have the same number of samples (to avoid reevaluating *max_missing* if the user wants the same maximum rate of missing data). In other functions, *max_missing* is the maximum *number* of missing and is an integer. :param dest: a :class:`.Site` instance that will be reset and in which data will be placed. If specified, this function returns nothing. By default, the function returns a new :class:`.Site` instance. :param multiple: allow sites with more than two alleles in the ingroup. :return: A :class:`.Site` instance, (if *dest* is ``None``) or ``None`` (otherwise). .. note:: If an empty list of sites is provided and no structure is provided, a :exc:`ValueError` is raised. To support smoothly possibly empty lists, the used is prompted to provide a structure. """ if not isinstance(aln, _interface.Align): raise TypeError('expect an Align instance') return _haplotypes(aln, max_missing, impute_threshold, struct, dest, multiple)
[docs]def haplotypes_from_sites(sites, impute_threshold=0, struct=None, dest=None, multiple=False): """ Identification of haplotypes from a list of sites. Similar to :meth:`haplotypes_from_align` but takes a list of :class:`.Site` instances. No option *max_missing* is available; all sites are always considered. """ if not isinstance(sites, list): raise TypeError('expect a list of Site instances') if set(map(type, sites)) != set([_site.Site]): raise TypeError('expect a list of Site instances') return _haplotypes(sites, 1.0, impute_threshold, struct, dest, multiple)
def _haplotypes(sites, max_missing=0.0, impute_threshold=0, struct=None, dest=None, multiple=False): # check arguments / init if max_missing < 0.0 or max_missing > 1.0: raise ValueError('max_missing argument out of range') if impute_threshold < 0: raise ValueError('invalid value for impute_threshold argument') obj = _eggwrapper.Haplotypes() frq = _eggwrapper.FreqHolder() if struct is None: if isinstance(sites, _interface.Align): ni = sites._obj.get_nsam() else: if len(sites) == 0: raise ValueError('cannot process an empty list of sites') ni = sites[0]._obj.get_ns() struct_obj = _eggwrapper.StructureHolder() struct_obj.mk_dummy_structure(ni, 1) struct2_obj = struct_obj # used only with haplotypes_from_align pl = 1 no = 0 else: struct_obj = struct._obj struct2_obj = struct.make_sorted_auxiliary()._obj # used only with haplotypes_from_align pl = struct_obj.get_ploidy() ni = struct_obj.num_indiv_ingroup() no = struct_obj.num_indiv_outgroup() obj.setup(struct_obj) impute_idx = [] min_exploitable = ni * pl - int(max_missing * pl * ni) # pass sites if they come from an alignment genosite = _eggwrapper.Genotypes() if isinstance(sites, _interface.Align): if struct_obj.get_req() > sites._obj.get_nsam(): raise ValueError('structure object does not match alignment') site = _eggwrapper.SiteHolder() for i in range(sites._obj.get_nsit_all()): site.reset() if site.process_align(sites._obj, i, struct_obj) < min_exploitable: # process_align returns number of valid ingroup samples continue frq.setup_structure(struct2_obj) frq.process_site(site) genosite.process(site, struct2_obj) nall = frq.frq_ingroup().num_alleles_eff() if frq.num_alleles() > 1 and (multiple == True or nall < 3): obj.load(genosite) if impute_threshold > 0: impute_idx.append(i) if obj.n_sites() == 0: if dest is None: return _site.site_from_list([-1 for i in range(ni+no) for j in range(pl)], alphabet = alphabet_genotypes) else: dest.process_list([-1 for i in range(ni+no) for j in range(pl)], alphabet = alphabet_genotypes, reset=True) return None # process list of Site else: if len(sites) > 0: ns = sites[0]._obj.get_ns() for i, site in enumerate(sites): if site._obj.get_ns() != ns: raise ValueError('number of samples must be consistent between sites') frq.setup_structure(struct_obj) frq.process_site(site._obj) if frq.frq_ingroup().nseff()+frq.frq_outgroup().nseff() < min_exploitable: continue genosite.process(site._obj, struct_obj) nall = frq.frq_ingroup().num_alleles_eff() if frq.num_alleles() > 1 and (multiple == True or nall < 3): obj.load(genosite) if impute_threshold > 0: impute_idx.append(i) # complete haplotype detection obj.cp_haplotypes() # impute if requested if impute_threshold > 0: obj.prepare_impute(impute_threshold) for i in impute_idx: if isinstance(sites, _interface.Align): site.reset(pl) site.process_align(sites._obj, i, _eggwrapper.MAX, True) obj.resolve(site) else: obj.resolve(sites[i]._obj) obj.impute() # generate site if dest is None: dest = _site.Site() dest._alphabet = alphabet_genotypes obj.get_site(dest._obj) return dest else: dest._alphabet = alphabet_genotypes obj.get_site(dest._obj)