Source code for egglib.tools._code_tools

"""
    Copyright 2008-2023 Stephane De Mita, Mathieu Siol

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

import re, itertools
from .. import eggwrapper as _eggwrapper
from .. import _interface, alphabets
from . import _reading_frame, _seq_manip

# preload all genetic codes
_codes = {}
for i in range(_eggwrapper.GeneticCode.num_codes()):
    code = _eggwrapper.GeneticCode(i)
    key = code.get_code()
    _codes[key] = code


[docs]
class Translator(object):
    """
    Class to translate nucleotide sequences to proteins.

    :param code: genetic code identifier (see :ref:`here <genetic-codes>`).
        Required to be an integer among the valid values. The default
        value is the standard genetic code.
    """

    def _help_frame_processor(self, ls, frame):
        # take the frame argument and ls and generate self._frame and self._naa
        if frame is None:
            if ls == 0: self._frame = _reading_frame.ReadingFrame([])
            elif ls % 3 != 0: raise ValueError('alignment length must be a multiple of 3')
            else: self._frame = _reading_frame.ReadingFrame([(0, ls)])
        else:
            if frame.num_needed_bases > ls:
                raise ValueError('reading frame is extending past the end of the sequence')
            self._frame = frame
        self._naa = self._frame.num_codons

    def __init__(self, code=1):
        if code not in _codes: raise ValueError('unknown genetic code: {0}'.format(code))
        self._code = _codes[code]


[docs]
    def translate_codon(self, codon):
        """
        Translate a single codon. Translation is based on the 
        genetic code defined at construction time. Alternative start 
        codons are not supported.

        :param codon: codon as a three-character string
        :return: The one-letter amino acid code if the codon can be
            translated, or ``-`` if the codon is ``---``, or ``X`` in all
            other cases
            (including all cases with invalid nucleotides).
        """
        return alphabets.protein.get_value(
                    self._code.translate(
                        alphabets.codons.get_code(codon)))



[docs]
    def translate_sequence(self, sequence, allow_alt=False):
        """
        Translate a sequence.

        :param sequence: a :class:`str`, :class:`~.SequenceView` or
            compatible instance containing codon sequences. Only
            upper-case strings.

        :param frame: a :class:`~.ReadingFrame` instance providing the
            exon positions in the correct frame. By default, a
            non-segmented frame covering all sequences is assumed (in
            case the provided alignment is the coding region and length
            must be a multiple of 3).

        :param allow_alt: a boolean telling whether alternative start
            (initiation) codons should be considered. If ``False``,
            codons are translated as a methionine (M) if, and only if,
            there are among the alternative start codons for the considered
            genetic code and they appear at the first position for the
            sequence.

        :return: A new :class:`str` instance containing translated
            (protein) sequences.
        """
        if isinstance(sequence, str):
            if len(sequence) % 3 != 0: raise ValueError('number of bases must be a multiple of 3')
            sequence = re.findall('...', sequence.upper())
        if allow_alt:
            for i, codon in enumerate(sequence):
                if codon == '---': continue
                if self._code.start(alphabets.codons.get_code(codon)):
                    start = i
                    break
                if alphabets.codons.get_code(codon) < 0:
                    start = None
                    break
            else: start = None
        dest = [self._code.translate(alphabets.codons.get_code(codon)) for codon in sequence]
        if allow_alt and start is not None and dest[start] != 10: dest[start] = 10
        return ''.join(map(alphabets.protein.get_value, dest))



[docs]
    def translate_align(self, src, allow_alt=False, in_place=False):
        """
        Translate an :class:`.Align` instance.

        :param src: an :class:`!Align` containing codon sequences and
            using the :py:obj:`.alphabets.codons` alphabet.

        :param allow_alt: a boolean telling whether alternative start
            (initiation) codons should be considered. If ``True``,
            codons are translated as a methionine (M) if
            they are among the alternative start codons for the considered
            genetic code and they appear at the first position for the
            considered sequence (excluding all triplets of gap symbols
            appearing at the 5' end of the sequence).

        :param in_place: place translated sequences into the original
            :class:`!Align` instance (this discards original data and
            replaces the original alphabet by the protein alphabet). By
            default, returns a new instance.

        :return: By default, an original :class:`!Align` instance
            containing translated (protein) sequences. If *in_place* was
            ``True``, return ``None``.
        """

        # check input
        if not isinstance(src, _interface.Align): raise TypeError('expect an Align instance')
        if src._alphabet._obj.get_type() != 'codons': raise ValueError('alphabet must be codons')
        ns = src.ns
        ls = src.ls

        # create destination
        if not in_place:
            dest = _interface.Align(nsam=ns, nsit=ls, alphabet=alphabets.protein)
            for i in range(ns):
                dest.set_name(i, src.get_name(i))
                dest._obj.set_nlabels(i, src._obj.get_nlabels(i))
                for j in range(src._obj.get_nlabels(i)):
                    dest.set_label(i, j, src.get_label(i, j))
        else:
            dest = src
            dest._alphabet = alphabets.protein

        # iterate over sequences
        for i in range(ns):
            if allow_alt:
                for j in range(ls):
                    codon = src._obj.get_sample(i, j)
                    if codon == -1165: continue
                    if self._code.start(codon):
                        start = j
                        break
                    if codon < 0:
                        start = None
                        break
                else: start = None
            for j in range(ls):
                dest._obj.set_sample(i, j, self._code.translate(src._obj.get_sample(i, j)))
            if allow_alt and start is not None and dest._obj.get_sample(i, start) != 10: dest._obj.set_sample(i, start, 10)

        # return
        if not in_place: return dest



[docs]
    def translate_container(self, src, allow_alt=False, in_place=False):
        """
        Translate a :class:`.Container` instance.

        :param align: a :class:`!Container` containing codons sequences.

        :param allow_alt: a boolean telling whether alternative start
            (initiation) codons should be considered. If ``False``,
            codons are translated as a methionine (M) if, and only if,
            they are among the alternative start codons for the considered
            genetic code and they appear at the first position for the
            considered sequence.

        :param in_place: place translated sequences into the original
            :class:`!Container` instance (this discards original data).
            By default, return a new instance.

        :return: By default, a new :class:`!Container` instance
            containing translated (protein) sequences. If *in_place* was
            ``True``, return ``None``.
        """

        # check input
        if not isinstance(src, _interface.Container): raise TypeError('expect a Container instance')
        if src._alphabet._obj.get_type() != 'codons': raise ValueError('alphabet must be codons')
        ns = src.ns

        # create destination
        if not in_place:
            dest = _interface.Container(alphabet=alphabets.protein)
            dest._obj.set_nsam(ns)
            dest._ns = ns
            for i in range(ns):
                dest.set_name(i, src.get_name(i))
                dest._obj.set_nlabels(i, src._obj.get_nlabels(i))
                for j in range(src._obj.get_nlabels(i)):
                    dest.set_label(i, j, src.get_label(i, j))
        else:
            dest = src
            dest._alphabet = alphabets.protein

        # iterate over sequences
        for i in range(ns):
            ls = src._obj.get_nsit_sample(i)
            if not in_place: dest._obj.set_nsit_sample(i, ls)
            if allow_alt:
                for j in range(ls):
                    codon = src._obj.get_sample(i, j)
                    if codon == -1165: continue
                    if self._code.start(codon):
                        start = j
                        break
                    if codon < 0:
                        start = None
                        break
                    else: start = None
            for j in range(ls):
                dest._obj.set_sample(i, j, self._code.translate(src._obj.get_sample(i, j)))
            if allow_alt and start is not None and dest._obj.get_sample(i, start) != 10: dest._obj.set_sample(i, start, 10)

        # return
        if not in_place: return dest




[docs]
def translate(seq, code=1, allow_alt=False, in_place=False):
    """
    Translates coding nucleotide sequences to proteins. See the class
    :class:`.tools.Translator` for more details. This is a convenience method
    allowing to translate nucleotide sequences in a single call. For
    repetitive calls, direct use of :class:`!Translator` can be more
    efficient.

    :param seq: input codon sequences. Accepted types are
        :class:`.Align`, :class:`.Container`, :class:`.SequenceView`
        or :class:`str` (or also types compatible with :class:`str`).
        If the input object is an :class:`!Align` or a :class:`!Container`,
        the alphabet must be :py:obj:`.alphabets.codons`.

    :param code: genetic code identifier (see below).
        Required to be an integer among the valid values. The default
        value is the standard genetic code.

    :param allow_alt: a boolean telling whether alternative start
        (initiation) codons should be considered. If ``True``,
        codons are translated as a methionine (M) if, and only if,
        there are among the alternative start codons for the considered
        genetic code and they appear at the first position for the
        sequence. If *seq* is an :class:`!Align`, leading gaps are
        ignored as long as they appear as multiples of 3 (fully missing codons).

    :param in_place: place translated sequences in the provided
        :class:`!Align` or :class:`!Container` instance, overwriting
        initial data. Not allowed if *seq* is not of one of these two
        types. Replace the original alphabet by :py:obj:`.alphabets.protein`.
        By default, return a new instance.

    :return: Protein sequences as an :class:`!Align` or a
        :class:`!Container` if either of these types have been provided
        as *seq*, or as a string otherwise. If *in_place* has been
        specified, return ``None``.

    .. _genetic-codes:

    All genetic codes defined by the National Center for Biotechnology
    Information are supported and can be accessed using codes corresponding
    to the GenBank ``/trans_table`` qualifier. The codes are integers.

    .. include:: list_genetic_codes.txt

    Reference: National Center for Biotechnology Information
    [`<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>`_]
   """

    T = Translator(code=code)
    if isinstance(seq, _interface.Align):
        return T.translate_align(seq, allow_alt=allow_alt, in_place=in_place)
    elif isinstance(seq, _interface.Container):
        return T.translate_container(seq, allow_alt=allow_alt, in_place=in_place)
    else:
        if in_place == True: raise ValueError('cannot translate sequence in place')
        return T.translate_sequence(seq, allow_alt=allow_alt)



[docs]
class orf_iter:
    """
    Iterate over open reading frames.
    Return an iterator over non-segmented open reading frames (ORFs)
    found in a provided codon sequence in any of the six possible frames.

    :param sequence: a string or a :class:`.SequenceView` representing a codon sequence.

    :param code: genetic code identifier (see :ref:`here <genetic-codes>`).
        Required to be an integer among the valid values. The default
        value is the standard genetic code.

    :param min_length: minimum length of returned ORFs. This value must
        be at least 1. It is understood as the length of the encoded
        peptide (since ORFs are returned as nucleotide sequences, they
        will have a length of least three times this value).

    :param forward_only: consider only the three forward frames (do not
        consider the reverse strand).

    :param force_start: if ``True``, all returned ORFs are required to
        start with a start codon. Otherwise, all non-stop codons are
        included in ORFs. Note that alternative start codons (``CTG``
        and ``TTG`` for the standard genetic code) are also supported.

    :param allow_alt: allow alternative start codons (only considered
        if *force_start* is set).

    :param force_stop: require that ORFs
        end with a stop codon (this only excludes 3'-partial ORFs, that
        is ORFs that end with the end of the provided sequennce).

    :return: An iterator over all detected ORFs. Each ORF is represented
        by a ``(start, stop, length, frame)`` tuple where ``start`` is the
        start position of the ORF and ``stop`` the stop position (such as
        ``sequence[start:stop]`` returns the ORF sequence or its reverse
        complement [note that the stop codon, if present, is included]),
        ``length`` is the ORF length (number of amino acids, excluding any
        stop codon) and ``frame`` is the reading frame on which it was found: +1, +2,
        +3 are the frames on the forward strand (starting respectively at
        the first, second, and third base), and -1, -2, -3 are the frames
        on the reverse strand (starting respectively at the last, last
        but one, and last but two bases).
    """

    def __init__(self, sequence, code=1, min_length=1,
                 forward_only=False, force_start=True, allow_alt=False,
                 force_stop=True):

        # get/check arguments
        if code not in _codes: raise ValueError('unknown genetic code: {0}'.format(code))
        code = _codes[code]
        if min_length < 1: raise ValueError('value for `min_length` is too small')
        self._ls = len(sequence)
        self._sequence = sequence
        self._code = code
        self._min_length = min_length
        self._forward_only = forward_only
        self._force_start = force_start
        self._force_stop = force_stop
        self._frames = [None] * 6

        # make a list of codon status (1:start/2:stop/3:valid/4:missing)
        self._codon_table = {}
        aa_stop = alphabets.protein.get_code('*')
        valid, missing = alphabets.codons.get_alleles()
        for cod in valid:
            c = alphabets.codons.get_code(cod)
            if cod == 'ATG' or (allow_alt and code.start(c)):
                self._codon_table[cod] = 1
            elif code.translate(c) == aa_stop:
                self._codon_table[cod] = 2
            else:
                self._codon_table[cod] = 3
        for cod in missing:
            self._codon_table[cod] = 4

    def __iter__(self):
        self._pos = 0
        self._iter_f = itertools.cycle([1, 2, 3])
        self._iter_r = itertools.cycle([-1, -3, -2])
        if self._ls % 3 == 1:
            next(self._iter_r)
            next(self._iter_r)
        elif self._ls % 3 == 2:
            next(self._iter_r)
        self._cache = None
        return self

    def __next__(self):

        # return cache frame if any
        if self._cache:
            r = self._cache
            self._cache = None
            return r

        # proceed until an ORF (or the end) is found
        while True:

            # if reached end of sequence, process hanging ORFs and finish
            if self._pos > self._ls-3:
                return self.finishing() # ultimately raises StopIteration

            orf_f = None
            orf_r = None

            # process forward frame
            F = next(self._iter_f)
            cod = self._codon_table[self._sequence[self._pos:self._pos + 3]]
            if self._frames[F-1] is None:
                if (self._force_start == True and cod == 1) or (self._force_start == False and cod in [1, 3]):
                    self._frames[F-1] = [self._pos, None, None, F]
            else:
                if cod == 2:
                    self._frames[F-1][1] = self._pos + 3
                    self._frames[F-1][2] = (self._pos + 3 - self._frames[F-1][0]) // 3 - 1 # -1 because of stop
                    if self._frames[F-1][2] >= self._min_length: orf_f = tuple(self._frames[F-1])
                    self._frames[F-1] = None

            # process reverse frame
            if not self._forward_only:
                R = next(self._iter_r)
                cod = self._codon_table[_seq_manip.rc(self._sequence[self._pos:self._pos + 3])]
                if self._frames[2-R] is None:
                    if cod == 2:            # start, stop, len, frame, final_stop, last_M, has_non_stop
                        self._frames[2-R] = [self._pos, None, None, R, 1, None, False]
                    elif cod != 4 and not self._force_stop:
                        self._frames[2-R] = [self._pos, None, None, R, 0, self._pos + 3 if cod==1 else None, True]
                else:
                    if self._force_start == True and cod == 1:
                        self._frames[2-R][5] = self._pos + 3
                        self._frames[2-R][6] |= True
                    elif cod == 2:
                        if self._force_start:
                            if self._frames[2-R][5] is not None: # there is an M
                                self._frames[2-R][1] = self._frames[2-R][5]
                                self._frames[2-R][2] = (self._frames[2-R][5] - self._frames[2-R][0]) // 3 - self._frames[2-R][4]
                                if self._frames[2-R][2] >= self._min_length: orf_r = tuple(self._frames[2-R][:4])
                        elif self._frames[2-R][6]:
                            self._frames[2-R][1] = self._pos
                            self._frames[2-R][2] = (self._pos - self._frames[2-R][0]) // 3  - self._frames[2-R][4]
                            if self._frames[2-R][2] >= self._min_length: orf_r = tuple(self._frames[2-R][:4])
                        self._frames[2-R] = [self._pos, None, None, R, 1, None, False]
                    elif cod != 4:
                        self._frames[2-R][6] |= True

            # return frame (return orf_f and cache orf_r if both)
            self._pos += 1
            if orf_f and orf_r:
                self._cache = orf_r
                return orf_f
            if orf_f:
                return orf_f
            if orf_r:
                return orf_r

    def finishing(self):
        # if we reached the end, return any remaining still-running ORF (if allowed by options)

        # forward frames
        if not self._force_stop:
            for i in range(3):
                if self._frames[i] is not None:
                    self._frames[i][1] = (self._ls - i) // 3 * 3 + i
                    self._frames[i][2] = (self._frames[i][1] - self._frames[i][0]) // 3
                    r = self._frames[i]
                    self._frames[i] = None
                    if r[2] >= self._min_length: return tuple(r)

        # reverse frames
        for i in range(3):
            if self._frames[3+i] is not None:
                if self._force_start:
                    if self._frames[3+i][5] is None:
                        self._frames[3+i] = None
                        continue
                    self._frames[3+i][1] = self._frames[3+i][5]
                    self._frames[3+i][2] = (self._frames[3+i][5] - self._frames[3+i][0]) // 3 - self._frames[3+i][4]
                elif self._frames[3+i][6]:
                    self._frames[3+i][1] = self._ls - i
                    self._frames[3+i][2] = (self._frames[3+i][1] - self._frames[3+i][0]) // 3 - self._frames[3+i][4]
                if self._frames[3+i][6]:
                    r = self._frames[3+i][:4]
                    self._frames[3+i] = None
                    if r[2] >= self._min_length: return tuple(r)
                else:
                    self._frames[3+i] = None

        raise StopIteration



[docs]
def longest_orf(*args, **kwargs):
    """
    longest_orf(sequence, code=1, min_length=1,
                 forward_only=False, force_start=True, allow_alt=False,
                 force_stop=True)

    Detect the longest open reading frame. Arguments are identical to :func:`.orf_iter`.
    A :exc:`ValueError` is raised if two or more open
    reading frames have the largest length.

    :return: A ``(start, stop, length, frame)`` tuple (see the return
        value of :func:`.orf_iter` for details), or ``None`` if no open
        reading frame fits the requirements.
    """
    max_length = 0
    candidate = None
    for start, stop, length, frame in orf_iter(*args, **kwargs):
        if length == max_length:
            candidate = None
            num += 1
        elif length > max_length:
            max_length = length
            candidate = start, stop, length, frame
            num = 1

    if max_length == 0: return None
    if num > 1: raise ValueError('several equally long ORFs found')
    return candidate



[docs]
def backalign(nucl, aln, code=1, ignore_names=False, ignore_mismatches=False, fix_stop=False, allow_alt=True):
    """
    Align coding sequence based on the protein alignment.

    :param nucl: a :class:`.Container` or :class:`.Align` instance
        containing coding sequences that should be aligned. There should
        not be any alignment gap in the sequences. The alphabet must be
        :class:`.alphabets.codons`.

    :param aln: an :class:`!Align` instance containing an alignment of
        the protein sequences encoded by the coding sequences provided
        as *nucl*. Group labels of *aln*
        are not taken into account. The alphabet must be :class:`.alphabets.protein`.

    :param code: genetic code identifier (see :ref:`here <genetic-codes>`).
        Required to be an integer among the valid values. The default
        value is the standard genetic code.

    :param ignore_names: if ``True``, ignore names for matching
        sequences in the protein alignment to coding sequences.
        Sequences will be matched using their rank and the names in the
        returned alignment will be taken from *nucl*.

    :param ignore_mismatches: if ``True``, do not generate any exception
        if a predicted protein does not match the provided protein
        sequence (if the lengths differ, an exception is always raised).

    :param fix_stop: if ``True``, support a single trailing stop codon
        in coding sequences not represented by a ``*`` in the provided
        protein alignment (if the final stop codons have been
        stripped during alignment). If found, this stop codon will be
        flushed as left as possible (immediately after the last non-gap
        character) in the returned coding alignment.

    :return: An :class:`.Align` instance containing aligned coding codon
        sequences.

    If a mismatch is detected between a protein from *aln* and the
    corresponding prediction from *nucl*, an instance of
    :exc:`.tools.BackalignError` (a subclass of :exc:`ValueError`)
    is raised. Its attribute :py:obj:`~.tools.BackalignError.alignment` can be
    used to help identify the reason of the error. Mismatches (but not
    differences of length) can be ignored with the option
    *ignore_mismatches*.
    """
    # checking
    if not isinstance(aln, _interface.Align): raise TypeError('expect an Align instance')
    if aln._alphabet._obj.get_name() != 'protein': raise ValueError('alignment alphabet must be protein')
    if not isinstance(nucl, _interface.Container): raise TypeError('argument must be a Container instance')
    if nucl._alphabet._obj.get_type() != 'codons': raise ValueError('alignment alphabet must be codons')

    # get code
    translator = Translator(code=code)

    # initialize the return object
    ns = aln.ns
    ls = aln.ls
    result = _eggwrapper.DataHolder(True)
    result.set_nsam(ns)
    result.set_nsit_all(ls + int(fix_stop))

    # set a trailing gape at the end of all sequences to allow 1 or more additional stop codons
    if fix_stop:
        for i in range(ns): result.set_sample(i, ls, -1165)
        last_used = False # set to True if at least one sequence uses the last three bases

    # get mapping of indexes
    nucl = nucl._obj
    aln = aln._obj
    if nucl.get_nsam() != ns: raise ValueError('numbers of sequences don\'t match')
    names = list(map(nucl.get_name, range(ns)))
    if not ignore_names:
        if len(set(names)) < ns: raise ValueError('duplicate name found')
        aln_names = list(map(aln.get_name, range(ns)))
        ranks = []
        for name in names:
            try: ranks.append(aln_names.index(name))
            except ValueError: 'sequence name not found in protein alignment: {0}'.format(name)
    else:
        ranks = list(range(ns))

    # set names and group labels
    for idx_cds in range(ns):
        result.set_name(idx_cds, names[idx_cds])
        result.set_nlabels(idx_cds, nucl.get_nlabels(idx_cds))
        for k in range(nucl.get_nlabels(idx_cds)):
            result.set_label(idx_cds, k, nucl.get_label(idx_cds, k))

    # main loop
    last_used = False
    for idx_cds, idx_aln in enumerate(ranks):
        ls_nucl = nucl.get_nsit_sample(idx_cds)
        if ls_nucl < 1: raise ValueError('empty nucleotide sequence')

        # process all aligned aa positions
        c = 0
        for idx in range(ls):
            aa = aln.get_sample(idx_aln, idx)

            # process gaps
            if aa == -1: result.set_sample(idx_cds, idx, -1165)

            # non-gap
            else:
                codon = nucl.get_sample(idx_cds, c)
                c += 1
                if (not ignore_mismatches and translator._code.translate(codon) != aa): # check mismatch
                    raise BackalignError(names[idx_cds], nucl.get_sample, aln.get_sample, idx_cds, idx_aln, ls, ls_nucl, translator)
                result.set_sample(idx_cds, idx, codon) # set codon

        # if there is an additional codon at the end
        if c != ls_nucl:

            # if requested, check that it is a stop codon
            if fix_stop:
                codon = nucl.get_sample(idx_cds, c)
                c += 1

                # fix stop codon (only if all bases have been read)
                if c == ls_nucl and translator._code.translate(codon) == 20:
                    i = ls # there is necessarily room there, filled by gaps
                    while i > 0 and result.get_sample(idx_cds, i-1) == -1165: i -= 1
                    result.set_sample(idx_cds, i, codon)
                    if i == ls: last_used = True
                else: raise BackalignError(names[idx_cds], nucl.get_sample, aln.get_sample, idx_cds, idx_aln, ls, ls_nucl, translator)
            else: raise BackalignError(names[idx_cds], nucl.get_sample, aln.get_sample, idx_cds, idx_aln, ls, ls_nucl, translator)

    # if the last codon position has not been used at least once by fix_stop, remove them
    if fix_stop and not last_used:
        result.set_nsit_all(ls) # decrease by three

    # that's all, folks!
    return _interface.Align._create_from_data_holder(result, alphabets.codons)



[docs]
class BackalignError(ValueError):
    """
    Exception used to report errors
    occurring during the use of :func:`.tools.backalign` because of mismatches
    between the provided alignment and predicted proteins.
    """

    def __init__(self, name, fnuc, faln, i_nuc, i_aln, ls_aln, ls_nuc, translator):
        self._name = name
        message = 'mismatch between provided and predicted proteins for `{0}`'.format(name)
        prov = ''.join(map(alphabets.protein.get_value, [faln(i_aln, i) for i in range(ls_aln)])).replace('-', '')
        pred = ''.join(map(alphabets.codons.get_value, [fnuc(i_nuc, i) for i in range(ls_nuc)])).replace('-', '')
        if len(pred) % 3 != 0: message = 'length of nucleotide sequence not a multiple of 3 for {0}'.format(name)
        pred = ''.join([translator.translate_codon(''.join(codon)) for codon in zip(pred[::3], pred[1::3], pred[2::3])])

        n = max([len(prov), len(pred)])

        mid = []
        for i in range(n):
            if i >= len(prov):
                prov += '-'
                mid.append('~')
            elif i >= len(pred):
                pred += '-'
                mid.append('~')
            elif prov[i] != pred[i]:
                mid.append('#')
            else:
                mid.append('|')
        mid = ''.join(mid)

        c = 0
        self._alignment = []
        while True:
            A = prov[c:c+60]
            A = ' '.join([A[i*10:i*10+10] for i in range(6)])
            B = mid[c:c+60]
            B = ' '.join([B[i*10:i*10+10] for i in range(6)])
            C = pred[c:c+60]
            C = ' '.join([C[i*10:i*10+10] for i in range(6)])
            self._alignment.append('[provided]  {0}'.format(A) + '[{0}]\n'.format(c+60).rjust(7))
            self._alignment.append('            {0}\n'.format(B))
            self._alignment.append('[predicted] {0}'.format(C) + '[{0}]\n'.format(c+60).rjust(7))
            c += 40
            if c < n: self._alignment.append('\n')
            else: break

        self._alignment = ''.join(self._alignment)
        ValueError.__init__(self, message)

    @property
    def name(self):
        """
        Name of sequence for which the error occurred.
        """
        return self._name

    @property
    def alignment(self):
        """
        Comparaison of provided and predicted proteins.
        This string shows the two proteins with a middle line composed
        of the following symbols:

        * ``|``: match.
        * ``#``: mismatch.
        * ``~``: one protein shorter.
        """
        return self._alignment



[docs]
def trailing_stops(align, action=0, code=1, replacement='???'):
    """
    Process stop codons at the end of the sequences. This function
    detects and (optionally) fix trailing stop codons.
    The algorithm consists in locating the last non-gap codon of each
    sequence. If the last codon is partially gapped (which can happen
    if coding sequences are aligned using a DNA-based algorithm), it
    will not be detected and ignored.

    :param align: an :class:`.Align` containing aligned coding
        sequences. The alphabet must be :py:obj:`.alphabets.codons`.

    :param action: an integer specifying what should be done if a stop
        codon is found at the end of a given sequence. Possible actions
        are listed in the following table:

        +------+---------------------------------------------------+
        | Code | Action                                            +
        +======+===================================================+
        | 0    | Nothing (just count them).                        +
        +------+---------------------------------------------------+
        | 1    | Replace them by a gap, and delete the last        +
        |      | position if it is made by gaps only.              +
        +------+---------------------------------------------------+
        | 2    | Replace them by the value given as *replacement*. +
        +------+---------------------------------------------------+

        Note that using ``action=1`` is not stricly equivalent to using
        ``action=2, replacement='---'`` because the former deletes the
        last three positions of the alignment if needed while the latter
        does not.

    :param code: genetic code identifier (see :ref:`here <genetic-codes>`).
        Required to be an integer among the valid values. The default
        value is the standard genetic code.

    :param replacement: if *action* is set to 2, provide the codon
        that should be used to replace stop codons.

    :return: The number of sequences that had a trailing stop codons
        among the considered sequences.
    """

    # argument check
    if not isinstance(align, _interface.Align): raise TypeError('expect an Align instance')
    if align._alphabet.type != 'codons': raise ValueError('alignment alphabet must be codons')
    if code not in _codes: raise ValueError('unknown genetic code: {0}'.format(code))
    code = _codes[code]
    ns = align.ns
    ls = align.ls
    try: replacement = alphabets.codons.get_code(replacement)
    except ValueError: raise ValueError('invalid codon: {0}'.format(replacement))
    cnt = 0
    num_gapped = 0

    for i in range(ns):
        cur = ls - 1
        for cur in range(ls-1, -1, -1):
            if align._obj.get_sample(i, cur) != -1165:
                if code.translate(align._obj.get_sample(i, cur)) == 20:
                    cnt += 1
                    if action == 1:
                        align._obj.set_sample(i, cur, -1165)
                        num_gapped += 1
                    elif action == 2:
                        align._obj.set_sample(i, cur, replacement)
                break

    # remove final 3 gaps (only if ALL samples where replaced by gaps, in particular only if include_outgroup was true)
    if action == 1 and num_gapped == ns:
        align._obj.set_nsit_all(ls-3)

    return cnt



[docs]
def iter_stops(src, code=1):
    """
    Iterate over stop codons.
    Return an iterator providing the coordinates of all stop codons
    over all sequences. Each iteration returns a
    ``(sample, position)`` where ``sample`` is the sample index and
    ``position`` is the index of the stop codon.

    :param src: an :class:`.Align` or :class:`.Container` containing
        coding sequences. The alphabet must be :py:obj:`.alphabets.codons`.

    :param code: genetic code identifier (see :ref:`here <genetic-codes>`).
        Required to be an integer among the valid values. The default
        value is the standard genetic code.

    :return: An iterator over the ``(sample, position)`` tuples
        corresponding to each stop codon found in the alignment (see
        above).
    """

    # argument check
    if not isinstance(src, (_interface.Align, _interface.Container)): raise TypeError('expect an Align or Container instance')
    if src._alphabet.type != 'codons': raise ValueError('alignment alphabet must be codons')
    ns = src.ns
    if code not in _codes: raise ValueError('unknown genetic code: {0}'.format(code))
    code = _codes[code]
    if isinstance(src, _interface.Align): ls = src.ls

    for i in range(ns):
        if isinstance(src, _interface.Container): ls = src.ls(i)
        for j in range(ls):
            if code.translate(src._obj.get_sample(i, j)) == 20:
                yield (i, j)



[docs]
def has_stop(src, code=1):
    """
    Test if there is at least one stop codon.
    Return ``True`` if the sequences contain at least one codon stop
    at any position in any sequence, and ``False`` otherwise.

    :param src: an :class:`.Align` or :class:`.Container` containing
        aligned coding sequences.

    :param code: genetic code identifier (see :ref:`here <genetic-codes>`).
        Required to be an integer among the valid values. The default
        value is the standard genetic code.

    :return: A boolean.
    """
    for stop in iter_stops(src, code=code): break
    else: return False
    return True