"""
Copyright 2008-2023 Stephane De Mita, Mathieu Siol
This file is part of EggLib.
EggLib is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
EggLib is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with EggLib. If not, see <http://www.gnu.org/licenses/>.
"""
import sys, re, functools
from .. import _interface
from .. import alphabets
_iupac = {
'?': set('ACGT-'),
'-': set('-'),
'A': set('A'),
'C': set('C'),
'G': set('G'),
'T': set('T'),
'M': set('AC'),
'R': set('AG'),
'W': set('AT'),
'S': set('CG'),
'Y': set('CT'),
'K': set('GT'),
'B': set('CGT'),
'D': set('AGT'),
'H': set('ACT'),
'V': set('ACG'),
'N': set('ACGT')
}
for _i in list(_iupac.keys()):
if _i != '?' and _i != '-': # it would be no harm but awkward
_iupac[_i.lower()] = _iupac[_i]
_iupac_set = set(_iupac.keys())
_bases_set = _iupac_set - set('-?')
_trans = dict(zip('ACGT-?', 'TGCA-?'))
_a = []
_b = []
for _i, _values in _iupac.items():
if _i.islower(): continue
values = set([_trans[_j] for _j in _values])
for _j, _k in _iupac.items():
if _k == values:
_a.append(_i)
_b.append(_j)
if _i.lower() != _i:
_a.append(_i.lower())
_b.append(_j.lower())
break
else: raise RuntimeError('internal error')
_rc_conv = str.maketrans(''.join(_a), ''.join(_b))
_regex_map = {}
for _i, _j in _iupac.items():
_map = set()
for _m, _n in _iupac.items():
if _m.isupper():
for _k in _n:
if _k not in _j: break
else:
_map.add(_m)
_regex_map[_i] = _map.pop() if len(_map) == 1 else '[' + ''.join(_map) + ']'
_gap_codes = {
'DNA': [alphabets.DNA.get_code('-')],
'protein': [alphabets.protein.get_code('-')],
'codons': [alphabets.codons.get_code(allele) for allele in alphabets.codons.get_alleles()[1] if '-' in allele]
}
[docs]def rc(seq):
"""
Reverse-complement a DNA sequence.
:param seq: input nucleotide sequence, as a :class:`str`, or a :class:`.SequenceView`.
:return: The reverse-complemented sequence (see details below).
The case of the provided sequence is preserved. Ambiguity characters
are complemented according to the table shown :ref:`here <iupac-nomenclature>`.
Invalid characters characters raise a :exc:`ValueError`.
"""
if isinstance(seq, _interface.SequenceView): seq = seq.string()
if not _iupac_set.issuperset(seq): raise ValueError('invalid character in DNA sequence')
return seq[::-1].translate(_rc_conv)
[docs]def compare(seq1, seq2):
"""
Compare two sequences. The comparison supports ambiguity characters
(see :ref:`here <iupac-nomenclature>`) such that partially overlapping
ambiguity sets characters are not treated as different. For example,
``A`` and ``M`` are not treated as different, nor are ``M`` and
``R``. Only IUPAC characters may be supplied.
:param seq1: a DNA sequence (as a :class:`str` or a :class:`.SequenceView`).
:param seq2: another DNA sequence (idem).
:return: ``True`` if sequences have the same length and they either
are identical or differ only by overlapping IUPAC characters,
``False`` otherwise.
"""
if isinstance(seq1, _interface.SequenceView): seq1 = seq1.string()
if isinstance(seq2, _interface.SequenceView): seq2 = seq2.string()
if len(seq1) != len(seq2):
return False
try:
for i, j in zip(seq1, seq2):
if _iupac[i].isdisjoint(_iupac[j]):
return False
except KeyError as e:
raise ValueError('unknown DNA base: {0}'.format(e.args[0]))
return True
[docs]def regex(query, both_strands=False):
"""
Turn a DNA sequence into a regular expression. The input sequence
should contain IUPAC characters only (see
:ref:`here <iupac-nomenclature>`). Ambiguity characters will be teated as
follows: an ambiguity will match either (1) itself, (2) one of the
characters it defines, or (3) one of the ambiguity characters that
define a subset of the characters it defines. For example, a ``M``
in the query will match an ``A``, a ``C`` or a ``M``, and a ``D``
will match all the following: ``A``, ``G``, ``T``, ``R``, ``W``,
``K``, and ``D``. The fully degenerated ``N`` matches all four bases
and all intermediate ambiguity characters and, finally, ``?``
matches all allowed characters (including alignment gaps). Note ``-``
only matches itself.
:param query: a :class:`str` containing IUPAC characters or a :class:`.SequenceView`.
:param both_strands: look for the query on both forward and reverse
strands (by default, only on forward strand).
:return: A regular expression as a :class:`!str` expanding ambiguity
characters to all compatible characters.
Result of this function can be used with the module :mod:`re` to
locate occurrences of a motif or the position of a sequence as in:
.. code:: python
>>> regex = egglib.tools.regex(query)
>>> for hit in re.finditer(regex, subject):
... print(hit.start(), hit.end(), hit.group(0))
The returned regular expression includes upper-case characters,
regardless of the case of input characters. To perform a
case-insensitive search, use the :data:`re.IGNORECASE` flag of the
:mod:`!re` module in regular expression searches (as in
``re.search(egglib.tools.regex(query), subject, re.IGNORECASE)``.
Note that the regular expression is contained into a group if
*both_strands* is ``False``, and two groups otherwise (one for each
strand).
"""
if isinstance(query, _interface.SequenceView): query = query.string()
chars = list(query)
for i, v in enumerate(chars):
if v not in _regex_map: raise ValueError('invalid character in DNA sequence')
chars[i] = _regex_map[v]
if both_strands: return '({0})|{1}'.format(''.join(chars), regex(rc(query), False))
else: return '(' + ''.join(chars) + ')'
[docs]def motif_iter(subject, query, mismatches=0, both_strands=False, case_independent=True, only_base=True):
"""
Iterate over hits of a given query.
Return an iterator over hits of the provided query in the subject
sequence. The query should only contain bases (including IUPAC
ambiguity characters as listed in :ref:`here <iupac-nomenclature>`, but
excluding ``-`` and ``?``). Ambiguity characters are treated as
described in :func:`.regex` (the bottom line is that ambiguities in
the query can only match identical or less degenerate ambiguities in
the subject). Mismatches are allowed, and the iterator will first
yield identical hits (if they exist), then hits with one mismatch,
then hits with two mismatches, and so on.
:param subject: a DNA sequence as :class:`str` or a :class:`.SequenceView`. The sequence may
contain ambiguity characters. In principle, the subject sequence
should contain IUPAC characters only, but this is not strictly
enforced (non-IUPAC characters will always be treated as
different of IUPAC characters).
:param query: a DNA sequence as :class:`!str` or a :class:`!SequenceView`, also containing
IUPAC characters only.
:param mismatches: maximum number of mismatches.
:param both_strands: look for the query on both forward and reverse
strands (by default, only on forward strand).
:param case_independent: if ``True``, perform case-independent
searches.
:param only_base: if ``True``, never create a hit if the putative
hit sequence contains an alignment gap (``-``) or a ``?``
character, irrespective of the number of allowed mismatches.
:return: An iterator returning, for each hit, a
``(start, stop, strand, num)`` tuple where ``start`` and ``stop``
are the position of the hit such that ``subject[start:stop]``
returns the hit sequence, ``strand`` is the strand as ``+`` or
``-``, and ``num`` is the number of mismatches of the hit.
.. note::
If a given query has a hit on both strands at the same position
(this only happens if the sequence is a motif equal to its
reverse-complement like ``AATCGATT``), it is guaranteed that the
only one hit will be returned for a given position (no twice on
each strand). However, the value of the ``strand`` flag is not
defined.
.. warning::
This function is designed to be efficient only if the number of
mismatches is small (only a few). A combination of a large
number of mismatches (more than 2 or 3) and a large query
sequence (as early as 10 or 20), will take significant time to
complete. More complex problems require the use of genuine
pairwise ocal alignment tools.
"""
if isinstance(subject, _interface.SequenceView): subject = subject.string()
if isinstance(query, _interface.SequenceView): query = query.string()
# get query length
ln = len(query)
if ln < 1: raise ValueError('query must not be empty')
if not _bases_set.issuperset(query): raise ValueError('invalid character in DNA sequence')
if mismatches > ln: raise ValueError('too many mismatches allowed')
# initialize table of positions of mismatches
# if n>1, mismatch indexes are always increasing (first value for n=3 is [0,1,2])
n = 1 # pos cannot be incremented if 0
pos = [-1] # the last (and only) index will be incremented once before use
# current query (initially without mismatches)
cur = list(query)
# cache to avoid returning several times the same hit
cache = set()
# process all possibilities (no mismatches, then 1, then 2, etc.)
while True:
# perform search (will be done at least once)
for mo in re.finditer(regex(''.join(cur), both_strands), subject, re.I if case_independent else 0):
if mo.start() not in cache:
yield mo.start(), mo.end(), '+-'[mo.lastindex-1], 0 if pos[0]==-1 else n
cache.add(mo.start())
# increment mismatch index (initialized to -1)
i = n -1 # current mismatch index
pos[i] += 1 # increment last index
stop = ln # stop value (maximum pos for n=3 and ln=10 is [7,8,9])
# increment previous index if end is reached (and so one)
while pos[i] == stop:
# all mismatches processed, add a new mismatch
if i == 0:
n += 1
pos = list(range(n))
break
# increment previous index
else:
i -= 1
stop -= 1
pos[i] += 1
pos[i+1:] = range(pos[i]+1, pos[i]+n-i)
# quit if already too many mismatches
if n > mismatches: break
# create a list from the query with allowed mismatches (at least 1)
cur = list(query)
for i, v in enumerate(pos):
cur[v] = 'N' if only_base else '?'
def _get_gaps(gaps, alph):
# helper to validate gap option
if gaps is None:
gap_codes = _gap_codes.get(alph.name, None)
if gap_codes is None: raise ValueError('`gap` argument is required for non-standard alphabets')
else:
if not isinstance(gaps, (list, tuple)):
raise TypeError('invalid type for gaps')
if len(gaps) == 0:
raise ValueError('at least one gap value is expected')
try:
gap_codes = list(map(alph.get_code, gaps))
except ValueError:
raise ValueError('invalid gap value for the input alphabet')
return gap_codes
[docs]def ungap_all(obj, gaps=None):
"""
Remove all gaps from an object.
Generate a :class:`.Container` instance containing all sequences of
the provided object excluding any gaps.
:param obj: input object as an :class:`.Align` or a :class:`.Container`.
:param gaps: the list of allelic value(s) representing gaps. The
argument must be a :class:`list` or :class:`tuple` containing
one gap allele value, or more than one if appropriate. All
values must be valid alleles for the input alphabet. By
default, use the ``-`` character for the DNA and protein
alphabets, and all triplets containing at least one alignment
gaps for the codon alphabet. There is no default for other
alphabets.
:return: A :class:`!Container` instance.
"""
if not isinstance(obj, (_interface.Align, _interface.Container)): raise TypeError('expect an Align or a Container instance')
gap_codes = _get_gaps(gaps, obj._alphabet)
ls = obj.ls
result = _interface.Container(alphabet=obj._alphabet)
for i in range(obj.ns):
result.add_sample(
obj.get_name(i),
[obj._alphabet.get_value(obj._obj.get_sample(i, j)) for j in range(ls) if obj._obj.get_sample(i, j) not in gap_codes],
obj.get_sample(i).labels)
return result
[docs]def ungap(align, freq, gaps=None):
"""
Remove sites with too many gaps.
Generate a new :class:`.Align` instance containing all sequences of
the provided alignment but with only those sites for which the
frequency of alignment gaps is less than or equal to *freq*.
:param align: input alignment as an :class:`!Align`.
:param freq: maximum gap frequency in a site (if there are more
gaps, the site is not included in the returned :class:`.Align`
instance). This value is a
relative frequency (included in the [0, 1] range).
:param gaps: the list of allelic value(s) representing gaps. The
argument must be a :class:`list` or :class:`tuple` containing
one gap allele value, or more than one if appropriate. All
values must be valid alleles for the alignment alphabet. By
default, use the ``-`` character for the DNA and protein
alphabets, and all triplets containing at least one alignment
gaps for the codon alphabet. There is no default for other
alphabets.
:return: A new :class:`!Align` instance.
"""
# process arguments
if not isinstance(align, _interface.Align): raise TypeError('expect an Align instance')
gap_codes = _get_gaps(gaps, align._alphabet)
# get statistics
ls = align.ls
if freq < 0.0 or freq > 1.0: raise ValueError('`freq` value is out of range')
ns = align.ns
if ns == 0: raise ValueError('not enough sequences in alignment')
# collect retained positions
good = []
for i in range(ls):
gaps = 0
for j in range(ns):
if align._obj.get_sample(j, i) in gap_codes: gaps += 1
if gaps/ns <= freq:
good.append(i)
# return requested object
return align.extract(good)