Source code for egglib.io

"""
    Copyright 2015-2025 Stephane De Mita, Mathieu Siol

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

from ._ms import to_ms
from ._fasta import fasta_iter, from_fasta, from_fasta_string
from ._gff3 import GFF3, Gff3Feature
from ._vcf import VcfParser, VcfStringParser, VcfVariant, make_vcf_index, VcfSlidingWindow, VcfWindow, BED
from ._genbank import GenBank, GenBankFeature, GenBankFeatureLocation
from ._legacy import from_clustal, from_staden, from_genalys, get_fgenesh
from ._genepop import from_genepop
from .. import config
from .. import _site
from .. import alphabets

# NOTE(review): ``_vcf`` is never imported by name in this file; it is
# presumably bound in the package namespace as a side effect of the
# ``from ._vcf import ...`` statement above (this relies on this file
# being the package ``__init__`` -- confirm before moving this code).

#: First available position of a contig.
FIRST = _vcf.FIRST

#: Last available position of a contig.
LAST = _vcf.LAST

try:
    from ._vcfparser import VCF, index_vcf
    from ._vcfslider import VcfSlider
    config.htslib = 1
    class VCF(VCF):
[docs] def as_site(self): """ Extract current variant. Return the current genotype values as a :class:`.Site` object. The alphabet is DNA if the site is a SNP, or a string-type alphabet with ad hoc alleles otherwise. .. versionadded:: 3.4 """ site = _site.Site() genotypes = self.get_genotypes() if genotypes is None: raise TypeError('genotype field not available for this variant') match self.get_allele_type(): case 0: alphabet = alphabets.DNA case 1: alphabet = alphabets.Alphabet('string', self.get_alleles(), ['?']) case 2: alphabet = alphabets.Alphabet('custom', self.get_alleles(), ['?']) case None: return None case _: raise RuntimeError('unexpected return code of VCF.get_allele_type()') site.from_list([j if j is not None else '?' for i in genotypes for j in i], alphabet = alphabet) site.position = self.get_pos() site.chrom = self.get_chrom() return site
[docs] def iter_sites(self, *args, **kwargs): """iter_sites([chrom[, start][, stop]][, max_missing][, only_snp]) Return an iterator over sites. Each variant found in the VCF file is returned as a :class:`.Site` instance. By default, process sites from the current position up to the end of the file. To control the region used for iteration, use the *chrom*, *start*, and *stop* argument. :param chrom: name of the chromosome to process. If *None*, process all chromosomes. By default (if *start* and *stop* are not specified) process the whole chromosome, going back to the first position if needed. Only available if the VCF is indexed. :param start: start position. Only allowed if *chrom* is specified. The first site returned will be at the smallest available position starting from *start*. By default, start at the beginning of the chromosome. :param stop: stop position. Only allowed if *chrom* is specified. The position of the last site returned will be a most the one before the *stop* position. By default, stop at the last site of the chromosome. :param max_missing: maximum number of missing data to consider a variant. By default, all sites with at least one missing data are ignored. :param mode: 0: include only SNP variants (variants with at least two alleles, all corresponding to a single nucleotide, although those alleles are not required to be called in genotypes); 1: include SNP variants and invariant positions; 2: include all variants from the VCF. .. versionadded:: 3.4 """ return self.site_iterator(self, *args, **kwargs)
class site_iterator: def __init__(self, vcf, chrom=None, start=None, stop=None, max_missing=0, mode=0): if start is not None and chrom is None: raise ValueError('cannot specify `start` without `chrom`') if stop is not None: if stop < 0: raise ValueError('invalid value for `stop`') if chrom is None: raise ValueError('cannot specify `stop` without `chrom`') if max_missing < 0: raise ValueError('invalid value for `max_missing`') if mode not in {0, 1, 2}: raise ValueError('invalid value for `mode`') if chrom is not None: if start is None: self.b = vcf.goto(chrom) else: self.b = vcf.goto(chrom, start, vcf.END) else: self.b = vcf.read() self.vcf = vcf self.chrom = chrom self.stop = stop self.max_missing = max_missing self.mode = mode def __iter__(self): return self def __next__(self): while True: if not self.b: raise StopIteration if self.chrom is not None and self.vcf.get_chrom() != self.chrom: raise StopIteration if self.stop is not None and self.vcf.get_pos() >= self.stop: raise StopIteration if ( (self.mode == 0 and self.vcf.is_snp()) or (self.mode == 1 and self.vcf.is_single()) or self.mode == 2 ): site = self.vcf.as_site() if site.num_missing <= self.max_missing: self.b = self.vcf.read() return site self.b = self.vcf.read() VCF.__doc__ = _vcfparser.VCF.__doc__ except ImportError: config.htslib = 0
[docs] def VCF(*args, **kwargs): raise NotImplementedError('EggLib was compiled without htslib. Please refer to the installation instructions on https://egglib.org/install.html for more information.')
def index_vcf(*args, **kwargs): raise NotImplementedError('EggLib was compiled without htslib. Please refer to the installation instructions on https://egglib.org/install.html for more information.') def VcfSlider(*args, **kwargs): raise NotImplementedError('EggLib was compiled without htslib. Please refer to the installation instructions on https://egglib.org/install.html for more information.')