Source code for egglib.tools._seq_manip

"""
    Copyright 2008-2023 Stephane De Mita, Mathieu Siol

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

import sys, re, functools
from .. import _interface
from .. import alphabets

_iupac = {
    '?': set('ACGT-'),
    '-': set('-'),
    'A': set('A'),
    'C': set('C'),
    'G': set('G'),
    'T': set('T'),
    'M': set('AC'),
    'R': set('AG'),
    'W': set('AT'),
    'S': set('CG'),
    'Y': set('CT'),
    'K': set('GT'),
    'B': set('CGT'),
    'D': set('AGT'),
    'H': set('ACT'),
    'V': set('ACG'),
    'N': set('ACGT')
}

for _i in list(_iupac.keys()):
    if _i != '?' and _i != '-': # it would be no harm but awkward
        _iupac[_i.lower()] = _iupac[_i]

_iupac_set = set(_iupac.keys())
_bases_set = _iupac_set - set('-?')

_trans = dict(zip('ACGT-?', 'TGCA-?'))

_a = []
_b = []
for _i, _values in _iupac.items():
    if _i.islower(): continue
    values = set([_trans[_j] for _j in _values])
    for _j, _k in _iupac.items():
        if _k == values:
            _a.append(_i)
            _b.append(_j)
            if _i.lower() != _i:
                _a.append(_i.lower())
                _b.append(_j.lower())
            break
    else: raise RuntimeError('internal error')
_rc_conv =  str.maketrans(''.join(_a), ''.join(_b))

_regex_map = {}
for _i, _j in _iupac.items():
    _map = set()
    for _m, _n in _iupac.items():
        if _m.isupper():
            for _k in _n:
                 if _k not in _j: break
            else:
                _map.add(_m)
    _regex_map[_i] = _map.pop() if len(_map) == 1 else '[' + ''.join(_map) + ']'

_gap_codes = {
    'DNA': [alphabets.DNA.get_code('-')],
    'protein': [alphabets.protein.get_code('-')],
    'codons': [alphabets.codons.get_code(allele) for allele in alphabets.codons.get_alleles()[1] if '-' in allele]
}

[docs]def rc(seq): """ Reverse-complement a DNA sequence. :param seq: input nucleotide sequence, as a :class:`str`, or a :class:`.SequenceView`. :return: The reverse-complemented sequence (see details below). The case of the provided sequence is preserved. Ambiguity characters are complemented according to the table shown :ref:`here <iupac-nomenclature>`. Invalid characters characters raise a :exc:`ValueError`. """ if isinstance(seq, _interface.SequenceView): seq = seq.string() if not _iupac_set.issuperset(seq): raise ValueError('invalid character in DNA sequence') return seq[::-1].translate(_rc_conv)
[docs]def compare(seq1, seq2): """ Compare two sequences. The comparison supports ambiguity characters (see :ref:`here <iupac-nomenclature>`) such that partially overlapping ambiguity sets characters are not treated as different. For example, ``A`` and ``M`` are not treated as different, nor are ``M`` and ``R``. Only IUPAC characters may be supplied. :param seq1: a DNA sequence (as a :class:`str` or a :class:`.SequenceView`). :param seq2: another DNA sequence (idem). :return: ``True`` if sequences have the same length and they either are identical or differ only by overlapping IUPAC characters, ``False`` otherwise. """ if isinstance(seq1, _interface.SequenceView): seq1 = seq1.string() if isinstance(seq2, _interface.SequenceView): seq2 = seq2.string() if len(seq1) != len(seq2): return False try: for i, j in zip(seq1, seq2): if _iupac[i].isdisjoint(_iupac[j]): return False except KeyError as e: raise ValueError('unknown DNA base: {0}'.format(e.args[0])) return True
[docs]def regex(query, both_strands=False): """ Turn a DNA sequence into a regular expression. The input sequence should contain IUPAC characters only (see :ref:`here <iupac-nomenclature>`). Ambiguity characters will be teated as follows: an ambiguity will match either (1) itself, (2) one of the characters it defines, or (3) one of the ambiguity characters that define a subset of the characters it defines. For example, a ``M`` in the query will match an ``A``, a ``C`` or a ``M``, and a ``D`` will match all the following: ``A``, ``G``, ``T``, ``R``, ``W``, ``K``, and ``D``. The fully degenerated ``N`` matches all four bases and all intermediate ambiguity characters and, finally, ``?`` matches all allowed characters (including alignment gaps). Note ``-`` only matches itself. :param query: a :class:`str` containing IUPAC characters or a :class:`.SequenceView`. :param both_strands: look for the query on both forward and reverse strands (by default, only on forward strand). :return: A regular expression as a :class:`!str` expanding ambiguity characters to all compatible characters. Result of this function can be used with the module :mod:`re` to locate occurrences of a motif or the position of a sequence as in: .. code:: python >>> regex = egglib.tools.regex(query) >>> for hit in re.finditer(regex, subject): ... print(hit.start(), hit.end(), hit.group(0)) The returned regular expression includes upper-case characters, regardless of the case of input characters. To perform a case-insensitive search, use the :data:`re.IGNORECASE` flag of the :mod:`!re` module in regular expression searches (as in ``re.search(egglib.tools.regex(query), subject, re.IGNORECASE)``. Note that the regular expression is contained into a group if *both_strands* is ``False``, and two groups otherwise (one for each strand). """ if isinstance(query, _interface.SequenceView): query = query.string() chars = list(query) for i, v in enumerate(chars): if v not in _regex_map: raise ValueError('invalid character in DNA sequence') chars[i] = _regex_map[v] if both_strands: return '({0})|{1}'.format(''.join(chars), regex(rc(query), False)) else: return '(' + ''.join(chars) + ')'
[docs]def motif_iter(subject, query, mismatches=0, both_strands=False, case_independent=True, only_base=True): """ Iterate over hits of a given query. Return an iterator over hits of the provided query in the subject sequence. The query should only contain bases (including IUPAC ambiguity characters as listed in :ref:`here <iupac-nomenclature>`, but excluding ``-`` and ``?``). Ambiguity characters are treated as described in :func:`.regex` (the bottom line is that ambiguities in the query can only match identical or less degenerate ambiguities in the subject). Mismatches are allowed, and the iterator will first yield identical hits (if they exist), then hits with one mismatch, then hits with two mismatches, and so on. :param subject: a DNA sequence as :class:`str` or a :class:`.SequenceView`. The sequence may contain ambiguity characters. In principle, the subject sequence should contain IUPAC characters only, but this is not strictly enforced (non-IUPAC characters will always be treated as different of IUPAC characters). :param query: a DNA sequence as :class:`!str` or a :class:`!SequenceView`, also containing IUPAC characters only. :param mismatches: maximum number of mismatches. :param both_strands: look for the query on both forward and reverse strands (by default, only on forward strand). :param case_independent: if ``True``, perform case-independent searches. :param only_base: if ``True``, never create a hit if the putative hit sequence contains an alignment gap (``-``) or a ``?`` character, irrespective of the number of allowed mismatches. :return: An iterator returning, for each hit, a ``(start, stop, strand, num)`` tuple where ``start`` and ``stop`` are the position of the hit such that ``subject[start:stop]`` returns the hit sequence, ``strand`` is the strand as ``+`` or ``-``, and ``num`` is the number of mismatches of the hit. .. note:: If a given query has a hit on both strands at the same position (this only happens if the sequence is a motif equal to its reverse-complement like ``AATCGATT``), it is guaranteed that the only one hit will be returned for a given position (no twice on each strand). However, the value of the ``strand`` flag is not defined. .. warning:: This function is designed to be efficient only if the number of mismatches is small (only a few). A combination of a large number of mismatches (more than 2 or 3) and a large query sequence (as early as 10 or 20), will take significant time to complete. More complex problems require the use of genuine pairwise ocal alignment tools. """ if isinstance(subject, _interface.SequenceView): subject = subject.string() if isinstance(query, _interface.SequenceView): query = query.string() # get query length ln = len(query) if ln < 1: raise ValueError('query must not be empty') if not _bases_set.issuperset(query): raise ValueError('invalid character in DNA sequence') if mismatches > ln: raise ValueError('too many mismatches allowed') # initialize table of positions of mismatches # if n>1, mismatch indexes are always increasing (first value for n=3 is [0,1,2]) n = 1 # pos cannot be incremented if 0 pos = [-1] # the last (and only) index will be incremented once before use # current query (initially without mismatches) cur = list(query) # cache to avoid returning several times the same hit cache = set() # process all possibilities (no mismatches, then 1, then 2, etc.) while True: # perform search (will be done at least once) for mo in re.finditer(regex(''.join(cur), both_strands), subject, re.I if case_independent else 0): if mo.start() not in cache: yield mo.start(), mo.end(), '+-'[mo.lastindex-1], 0 if pos[0]==-1 else n cache.add(mo.start()) # increment mismatch index (initialized to -1) i = n -1 # current mismatch index pos[i] += 1 # increment last index stop = ln # stop value (maximum pos for n=3 and ln=10 is [7,8,9]) # increment previous index if end is reached (and so one) while pos[i] == stop: # all mismatches processed, add a new mismatch if i == 0: n += 1 pos = list(range(n)) break # increment previous index else: i -= 1 stop -= 1 pos[i] += 1 pos[i+1:] = range(pos[i]+1, pos[i]+n-i) # quit if already too many mismatches if n > mismatches: break # create a list from the query with allowed mismatches (at least 1) cur = list(query) for i, v in enumerate(pos): cur[v] = 'N' if only_base else '?'
def _get_gaps(gaps, alph): # helper to validate gap option if gaps is None: gap_codes = _gap_codes.get(alph.name, None) if gap_codes is None: raise ValueError('`gap` argument is required for non-standard alphabets') else: if not isinstance(gaps, (list, tuple)): raise TypeError('invalid type for gaps') if len(gaps) == 0: raise ValueError('at least one gap value is expected') try: gap_codes = list(map(alph.get_code, gaps)) except ValueError: raise ValueError('invalid gap value for the input alphabet') return gap_codes
[docs]def ungap_all(obj, gaps=None): """ Remove all gaps from an object. Generate a :class:`.Container` instance containing all sequences of the provided object excluding any gaps. :param obj: input object as an :class:`.Align` or a :class:`.Container`. :param gaps: the list of allelic value(s) representing gaps. The argument must be a :class:`list` or :class:`tuple` containing one gap allele value, or more than one if appropriate. All values must be valid alleles for the input alphabet. By default, use the ``-`` character for the DNA and protein alphabets, and all triplets containing at least one alignment gaps for the codon alphabet. There is no default for other alphabets. :return: A :class:`!Container` instance. """ if not isinstance(obj, (_interface.Align, _interface.Container)): raise TypeError('expect an Align or a Container instance') gap_codes = _get_gaps(gaps, obj._alphabet) ls = obj.ls result = _interface.Container(alphabet=obj._alphabet) for i in range(obj.ns): result.add_sample( obj.get_name(i), [obj._alphabet.get_value(obj._obj.get_sample(i, j)) for j in range(ls) if obj._obj.get_sample(i, j) not in gap_codes], obj.get_sample(i).labels) return result
[docs]def ungap(align, freq, gaps=None): """ Remove sites with too many gaps. Generate a new :class:`.Align` instance containing all sequences of the provided alignment but with only those sites for which the frequency of alignment gaps is less than or equal to *freq*. :param align: input alignment as an :class:`!Align`. :param freq: maximum gap frequency in a site (if there are more gaps, the site is not included in the returned :class:`.Align` instance). This value is a relative frequency (included in the [0, 1] range). :param gaps: the list of allelic value(s) representing gaps. The argument must be a :class:`list` or :class:`tuple` containing one gap allele value, or more than one if appropriate. All values must be valid alleles for the alignment alphabet. By default, use the ``-`` character for the DNA and protein alphabets, and all triplets containing at least one alignment gaps for the codon alphabet. There is no default for other alphabets. :return: A new :class:`!Align` instance. """ # process arguments if not isinstance(align, _interface.Align): raise TypeError('expect an Align instance') gap_codes = _get_gaps(gaps, align._alphabet) # get statistics ls = align.ls if freq < 0.0 or freq > 1.0: raise ValueError('`freq` value is out of range') ns = align.ns if ns == 0: raise ValueError('not enough sequences in alignment') # collect retained positions good = [] for i in range(ls): gaps = 0 for j in range(ns): if align._obj.get_sample(j, i) in gap_codes: gaps += 1 if gaps/ns <= freq: good.append(i) # return requested object return align.extract(good)