Source code for egglib.stats._paralog_pi
"""
Copyright 2016-2021 Stéphane De Mita, Mathieu Siol
This file is part of EggLib.
EggLib is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
EggLib is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with EggLib. If not, see <http://www.gnu.org/licenses/>.
"""
from .. import eggwrapper as _eggwrapper
from .. import _interface
[docs]def paralog_pi(align, struct_p, struct_i, max_missing=0.0):
"""
Compute diversity statistics for a gene family. Based on
Innan (*Genetics* 2003 **163**:803-810). An
estimate of genetic diversity is provided for every paralog and for
every pair of paralogs, provided that enough non-missing data is
available (at least 2 samples are required). Note that sites with
more than two alleles are always considered.
:param align: an :class:`.Align` containing the sequence of all
available paralog for all samples. The outgroup is ignored.
:param struct_p: a :class:`.Structure` providing the organisation of
sequences in paralogs. This structure must have a ploidy of 1 (no
individual structure). Clusters, if defined, are ignored. The
sequences of all individuals for a given paralog should be grouped
together in that structure. There might be a different number of
sequences per paralog due to missing data.
:param struct_i: a :class:`.Structure` providing the organisation of
sequences in individuals. This structure must have a ploidy of 1
(no individual structure). Clusters, if defined, are ignored. The
sequences of all paralogs for a given individual should be grouped
together in that structure. There might be a different number of
sequences per individual due to missing data.
:param max_missing: maximum relative proportion of missing data (if there are
more missing data at a site, the site is ignored altogether).
.. note::
Here, *max_missing* is a relative proportion to allow
using the same value for different alignments that might
not have the same number of samples (to avoid reevaluating
*max_missing* if the user wants the same maximum rate
of missing data). In other functions, *max_missing* is
the maximum *number* of missing and is an integer.
:return: A new :class:`.ParalogPi` instance which provides methods
to access the number of used sites and the diversity for each
paralog/paralog pair.
"""
pp = ParalogPi()
pp.setup(struct_p, struct_i, max_missing)
pp.process_align(align)
return pp
[docs]class ParalogPi(object):
"""
Compute diversity statistics for a gene family.
See :func:`.paralog_pi` for more details. This class
can be used directly (1) to analyse data with more efficiency (by
reusing the same instance) or (2) to combine data from different
alignments, or (3) for pass individual sites. Do first call
:meth:`.setup`.
"""
def __init__(self):
self._obj = _eggwrapper.ParalogPi()
self._req = 0
[docs] def setup(self, struct_p, struct_i, max_missing=0.0):
"""
Specify the structure in paralog and individuals. The arguments
are as as described for :func:`.paralog_pi`.
Only this method resets the instance.
"""
if struct_p.ploidy != 1: raise ValueError('ploidy is required to be 1')
if struct_i.ploidy != 1: raise ValueError('ploidy is required to be 1')
if max_missing < 0 or max_missing > 1: raise ValueError('max_missing out of bound')
self._req = struct_p.req_ns
self._obj.reset(struct_p._obj, struct_i._obj, max_missing)
[docs] def process_align(self, aln):
"""
Process an alignment. The alignment must match the structure
passed to :meth:`.setup`.
Diversity estimates are incremented (no reset).
:param aln: an :class:`.Align` instance.
"""
if not isinstance(aln, _interface.Align): raise TypeError('expect an Align instance')
if aln.ns < 2: return
if aln.ns > self._req: raise ValueError('unsufficient number of samples in alignment')
for site in aln.iter_sites():
self._obj.load(site._obj)
[docs] def process_site(self, site):
"""
Process a site. The site must match the structure passed to
:meth:`.setup`. Diversity estimates are incremented (no reset).
:param site: a :class:`.Site` instance.
"""
if site.ns < 2: return
if site.ns > self._req: raise ValueError('unsufficient number of samples in site')
self._obj.load(site._obj)
[docs] def num_sites(self, *args):
"""
num_sites([i[, j]])
Number of sites with data. Number of sites: with any data
(without arguments), with data for
paralog *i* (if only *i* specified), with data for the pair
of paralogs *i* and *j* (if both specified). This counts the number
of sites which have not been excluded based on the *num_missing* argument,
and which have at least one pair of samples for the considered sample.
"""
if len(args) == 0:
return self._obj.num_sites_tot()
elif len(args) == 1:
if args[0] < 0 or args[0] >= self._obj.num_paralogs(): raise IndexError('invalid paralog index')
return self._obj.num_sites_paralog(args[0])
elif len(args) == 2:
if args[0] < 0 or args[0] >= self._obj.num_paralogs(): raise IndexError('invalid paralog index')
if args[1] < 0 or args[1] >= self._obj.num_paralogs() or args[1] == args[0]: raise IndexError('invalid paralog index')
return self._obj.num_sites_pair(args[0], args[1])
else:
raise ValueError('invalid number of arguments')
[docs] def Piw(self, i):
"""
Within-paralog diversity for paralog *i*.
"""
if i<0 or i>=self._obj.num_paralogs(): raise IndexError('invalid paralog index')
if self._obj.num_sites_paralog(i) < 1: return None
return self._obj.Piw(i)
[docs] def Pib(self, i, j):
"""
Between-paralog diversity for paralogs *i* and *j*.
"""
if i<0 or i>=self._obj.num_paralogs(): raise IndexError('invalid paralog index')
if j<0 or j>=self._obj.num_paralogs() or j==i: raise IndexError('invalid paralog index')
if self._obj.num_sites_pair(i, j) < 1: return None
return self._obj.Pib(i, j)