Source code for egglib.wrappers._blast

"""
    Copyright 2009-2023 Stephane De Mita, Mathieu Siol, Thomas Coudoux

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

# cf: https://www.ncbi.nlm.nih.gov/books/NBK279684/
# see also: https://doctorlib.info/medical/blast/7.html (checked for strands)

import subprocess, os, collections, sys
import xml.etree.ElementTree
from .. import _interface, alphabets
from ..tools import _code_tools
from . import _utils

class _BLAST_tool(_utils._App):
    def _check_path(self, path, cfg):
        cmd = (path, '-version')
        try:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
            stdout, stderr = p.communicate()
            if len(stderr): return stderr
        except OSError as e:
            return e.strerror

_blast_apps = {}
for k in 'makeblastdb', 'blastn', 'blastp', 'blastx', 'tblastn', 'tblastx':
    _blast_apps[k] = _BLAST_tool(key=k, default=k)
    _utils.paths._add(_blast_apps[k])


[docs]
def makeblastdb(source, dbtype=None, out=None, input_type="fasta", verbose=False,
    title=None, parse_seqids=False, hash_index=False, mask_data=None, mask_id=None,
    mask_desc=None, blastdb_version=5,
    max_file_sz="1GB", taxid=None, taxid_map=None):
    """
    Create a BLAST database.

    :param source: name of an input file of the appropriate format.
        If not fasta, the format must be specified using the *input_type*
        option. Alternatively, *source* can be a :class:`.Container` or
        :class:`.Align` instance. If so, its alphabet must be DNA or protein
        and the *dbtype* argument, if specified, must match. Note that
        passing a :class:`!Container` or :class:`!Align` instance must be
        avoided for large databases.

    :param dbtype: database type: ``"nucl"`` or ``"prot"`` are acceptable. Can
        be omitted if a :class:`!Container` or an :class:`!Align` is provided
        as *source*.

    :param out: database name. Must be specified if a :class:`!Container`
        or an :class:`!Align` is provided as *source*, or if the *input_type*
        is "blastdb", otherwise the input file name is used as database name.

    :param input_type: format of input file. Must be ``"fasta"`` if
        a :class:`!Container` or an :class:`!Align` is provided as *source*.
        Otherwise must describe the format of *source*: ``"fasta"``, ``"asn1_bin"``,
        ``"asn1_txt"``, or ``"blastdb"``.

    :param verbose: display makeblastdb output (by default, it is returned
        by the function). Errors are always displayed.

    :param title: database title. A default title is inserted in case a
        :class:`!Container` or :class:`!Align` instance is passed as *source*.

    :parse_seqids: parse seqid from sequence names (considered if `input_type` is fasta,
        including if a :class:`!Container` or an :class:`!Align` is provided
        as *source*; argument ignored otherwise: seqid is always imported).

    :hash_index: create index of sequence hash values.

    :mask_data: list of input files containing masking data.

    :mask_id: list of strings to uniquely identify the masking algorithm,
              one for each mask file (requires *mask_data*).

    :mask_desc: list of free form strings to describe the masking algorithm
                details, one for each mask file (requires *mask_id*).

    :blastdb_version: version of BLAST database to be created (4 or 5).

    :max_file_sz: maximum file size for BLAST database files.

    :taxid: taxonomy ID to assign to all sequences as an integer
            (incompatible with *taxid_map*).

    :taxid_map: text file mapping sequence IDs to taxonomy IDs
            (requires *parse_seqids*, incompatible with *taxid*).

    :return: Standard output of the program (`None` if *verbose* was ``True``).

    Please refer to the manual of BLAST tools for more details.

    .. versionchanged 3.2::
        *blastdb_version* default value was previously 4. *gi_mask* and
        *gi_mask_name* options are removed.
    """

    # get command name
    path = _blast_apps['makeblastdb'].get_path()
    if path is None:
        raise RuntimeError('makeblastdb program not available -- please configure path')
    cmd = [path]

    # process main arguments arguments
    if isinstance(source, (_interface.Container, _interface.Align)):
        cmd.extend(['-in', '-'])
        if source.alphabet == alphabets.DNA and (dbtype is None or dbtype == 'nucl'): cmd.extend(['-dbtype', 'nucl'])
        elif source.alphabet == alphabets.protein and (dbtype is None or dbtype == 'prot'): cmd.extend(['-dbtype', 'prot'])
        else: raise ValueError('invalid alphabet, dbtype or alphabet/dbtype mismatch')
        if input_type != 'fasta': raise ValueError('`input_type` must be "fasta"')
        if out is None: raise ValueError('`out` is required if an object is passed as `source`')
        cmd.extend(['-input_type', 'fasta'])
        if title is None: title = '{0} database from an EggLib {1}'.format(cmd[4], 'Container' if isinstance(source, _interface.Container) else 'Align')
    else:
        if not isinstance(source, (os.PathLike, str)): raise TypeError('`source` must be a file name, a Container or an Align')
        if input_type != 'blastdb' and not os.path.isfile(source): raise ValueError('file not found: {0}'.format(source))
        if dbtype is None: raise ValueError('`dbtype` is required')
        if dbtype not in ['nucl', 'prot']: raise ValueError('invalid value for `dbtype`')
        if input_type not in ['fasta', 'asn1_bin', 'asn1_txt', 'blastdb']: raise ValueError('invalid value for `input_type`')
        if input_type == 'blastdb' and out is None or out == source: raise ValueError('a different database name is required for input type `blastdb`')
        cmd.extend(['-in', source, '-dbtype', dbtype, '-input_type', input_type])
    if out is not None:
        if not isinstance(out, (os.PathLike, str)): raise TypeError('`out` must be a string or a path-like object')
        cmd.extend(['-out', out])

    # check types of other arguments
    if not isinstance(blastdb_version, int): raise TypeError('`blastdb_version` must be an integer')
    if blastdb_version not in [4, 5]: raise ValueError('supported values for `blastdb_version` are 4 and 5 only')
    if not isinstance(max_file_sz, str): raise TypeError('`max_file_sz` must be a string')
    if title is not None and not isinstance(title, str): raise TypeError('`title` must be a string')
    if not isinstance(parse_seqids, bool): raise TypeError('`parse_seqids` must be a boolean')
    if input_type != 'fasta': parse_seqids = True
    if not isinstance(hash_index, bool): raise TypeError('`hash_index` must be a boolean')
    if mask_data is not None:
        if not isinstance(mask_data, (list, tuple)): raise TypeError('`mask_data` must be a list or a tuple')
        if not all([isinstance(item, str) for item in mask_data]): raise TypeError('`mask_data` items must be strings')
        if not all(map(os.path.isfile, mask_data)): raise ValueError('at least one file specified in `mask_data` does not exist')
        if len(mask_data) == 0: raise ValueError('there must be at least one item in `mask_data`')
    if mask_id is not None:
        if not isinstance(mask_id, (list, tuple)): raise TypeError('`mask_id` must be a list or a tuple')
        if not all([isinstance(item, str) for item in mask_id]): raise TypeError('`mask_id` items must be strings')
    if mask_desc is not None:
        if not isinstance(mask_desc, (list, tuple)): raise TypeError('`mask_desc` must be a list or a tuple')
        if not all([isinstance(item, str) for item in mask_desc]): raise TypeError('`mask_desc` items must be strings')
    if taxid is not None:
        if not isinstance(taxid, int): raise TypeError('`taxid` must be an integer')
        if taxid < 0: raise ValueError('`taxid` must be >= 0')
    if taxid_map is not None:
        if not isinstance(taxid_map, (str, os.PathLike)): raise TypeError('`taxid_map` must be a string or pathlike object')
        if not os.path.isfile(taxid_map): raise ValueError('file passed for `taxid_map` not found: {0}'.format(taxid_map))

    # check compatibility of other arguments
    if mask_id is not None:
        if mask_data is None: raise ValueError('`mask_id` requires `mask_data`')
        if len(mask_id) != len(mask_data): raise ValueError('`mask_id` must have the same length than `mask_data`')
    if mask_desc is not None:
        if mask_id is None: raise ValueError('`mask_desc` requires `mask_id`')
        if len(mask_desc) != len(mask_data): raise ValueError('`mask_desc` must have the same length than `mask_data`')
    if taxid is not None and taxid_map is not None: raise ValueError('`taxid` and `taxid_map` are incompatible')
    if taxid_map is not None and not parse_seqids: raise ValueError('`taxid_map` requires `parse_seqids`')

    # load the options
    if title is not None: cmd.extend(['-title', title])
    if parse_seqids: cmd.append('-parse_seqids')
    if hash_index: cmd.append('-hash_index')
    if mask_data is not None: cmd.extend(['-mask_data', ','.join(mask_data)])
    if mask_id is not None: cmd.extend(['-mask_id', ','.join(mask_id)])
    if mask_desc is not None: cmd.extend(['-mask_desc', ','.join(mask_desc)])
    cmd.extend(['-blastdb_version', str(blastdb_version), '-max_file_sz', max_file_sz])
    if taxid: cmd.extend(['-taxid', str(taxid)])
    if taxid_map: cmd.extend(['-taxid_map', taxid_map])

    # run the program
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE,
            stdout=None if verbose else subprocess.PIPE,
            stderr=subprocess.PIPE, universal_newlines=True)
    if isinstance(source, (_interface.Container, _interface.Align)): stdin = source.fasta()
    else: stdin = None
    stdout, stderr = p.communicate(stdin)
    if p.returncode != 0:
        raise RuntimeError('error while running makeblastdb (message below)\n' + stderr)
    return stdout


_blastn_costs = {
        # r/p     default, allowed
        (1,-2): ( (5,2),   [(5,2), (2,2), (1,2), (0,2), (3,1), (2,1), (1,1)] ),
        (1,-3): ( (5,2),   [(5,2), (2,2), (1,2), (0,2), (2,1), (1,1)] ),
        (1,-4): ( (5,2),   [(5,2), (1,2), (0,2), (2,1), (1,1)] ),
        (2,-3): ( (5,2),   [(4,4), (2,4), (0,4), (3,3), (6,2), (5,2), (4,2), (2,2)] ),
        (4,-5): ( (12,8),  [(12,8), (6,5), (5,5), (4,5), (3,5)] ),
        (1,-1): ( (5,2),   [(5,2), (3,2), (2,2), (1,2), (0,2), (4,1), (3,1), (2,1)] )}

_blastp_costs = {
    'BLOSUM62': [(32767, 32767), (11, 2), (10, 2), (9, 2), (8, 2), (7, 2),
                        (6, 2), (13, 1), (12, 1), (11, 1), (10, 1), (9, 1)],

    'BLOSUM80': [(32767, 32767), (25, 2), (13, 2), (9, 2), (8, 2), (7, 2),
                        (6, 2), (11, 1), (10, 1), (9, 1)],

    'PAM30': [(32767, 32767), (7, 2), (6, 2), (5, 2), (10, 1), (9, 1),
                        (8, 1), (15, 3), (14, 2), (14, 1), (13, 3)],

    'PAM70': [(32767, 32767), (8, 2), (7, 2), (6, 2), (11, 1), (10, 1),
                        (9, 1), (11, 2), (12, 3)]
}

_blastp_costs_defaults = {
    'BLOSUM62': (11, 1),
    'BLOSUM80': (10, 1),
    'PAM70': (10, 1),
    'PAM30': (9, 1)
}


[docs]
def megablast(query, db=None, subject=None, query_loc=None,
        subject_loc=None, evalue=10, parse_deflines=False,
        num_threads=1, word_size=28, gapopen=5, gapextend=2, reward=1,
        penalty=-2, strand='both', no_dust=False, no_soft_masking=False,
        lcase_masking=False, perc_identity=0, no_greedy=False):
    """
    ``megablast`` similarity search. This is designed for strongly similar
    sequences using a nucleotide query on a nucleotide database.

    .. include:: blast_options_common
    .. include:: blast_options_blastn
    :return: A :class:`.BlastOutput` instance.

    """
    return _blastn(query, db, subject, 'megablast', query_loc,
                subject_loc, evalue, parse_deflines, num_threads,
                word_size, gapopen, gapextend, reward, penalty, strand,
                no_dust, no_soft_masking, lcase_masking, perc_identity,
                None, None, no_greedy)



[docs]
def dc_megablast(query, db=None, subject=None, query_loc=None,
        subject_loc=None, evalue=10, parse_deflines=False,
        num_threads=1, word_size=11, gapopen=None, gapextend=None, reward=2,
        penalty=-3, strand='both', no_dust=False, no_soft_masking=False,
        lcase_masking=False, perc_identity=0, template_type='coding',
        template_length=18):
    """
    Dicontinuous ``megablast`` similarity search. This is designed for similar
    sequences (less similar than :func:`.megablast`) using a nucleotide
    query on a nucleotide database.

    .. include:: blast_options_common
    .. include:: blast_options_blastn
    :param template_type: template type for for dc-megablast. Possible
        values are ``"coding"`` (default), ``"optimal"``, and ``"coding_and_optimal"``.
    :param template_length: template length for dc-megablast. Possible
        values are 16, 18 (the default), and 21.
    """
    return _blastn(query, db, subject, 'dc-megablast', query_loc,
               subject_loc, evalue,parse_deflines, num_threads,
               word_size, gapopen, gapextend, reward, penalty, strand,
               no_dust, no_soft_masking, lcase_masking, perc_identity,
               template_type, template_length, False)



[docs]
def blastn(query, db=None, subject=None, query_loc=None,
        subject_loc=None, evalue=10, parse_deflines=False,
        num_threads=1, word_size=11, gapopen=None, gapextend=None, reward=2,
        penalty=-3, strand='both', no_dust=False, no_soft_masking=False,
        lcase_masking=False, perc_identity=0):
    """
    ``blastn`` similarity search. This is designed for distant sequences using a
    nucleotide query on a nucleotide database.

    .. include:: blast_options_common
    .. include:: blast_options_blastn
    :return: A :class:`BlastOutput` instance.
    """
    return _blastn(query, db, subject, 'blastn', query_loc, subject_loc,
               evalue, parse_deflines, num_threads, word_size, gapopen,
               gapextend, reward, penalty, strand, no_dust,
               no_soft_masking, lcase_masking, perc_identity, None,
               None, False)



[docs]
def blastn_short(query, db=None, subject=None, query_loc=None,
        subject_loc=None, evalue=1000, parse_deflines=False,
        num_threads=1, word_size=7, gapopen=None, gapextend=None, reward=1,
        penalty=-3, strand='both', no_dust=True, no_soft_masking=True,
        lcase_masking=False, perc_identity=0):
    """
    ``blastn`` for short sequences. This is optimised for query sequences
    up to 50 bp long. It automatically sets ``evalue=1000``, ``word_size=7``,
    ``no_dust=True``, ``no_soft_masking=True``, ``reward=1``, ``penalty=-3``,
    ``gapopen=5`` and ``gapextend=2``.

    .. include:: blast_options_common
    .. include:: blast_options_blastn
    :return: A :class:`BlastOutput` instance.
    """
    return _blastn(query, db, subject, 'blastn-short', query_loc,
               subject_loc, evalue, parse_deflines, num_threads,
               word_size, gapopen, gapextend, reward, penalty, strand,
               no_dust, no_soft_masking, lcase_masking, perc_identity,
               None, None, False)


@_utils._protect_run
def _blastn(query, db, subject, task, query_loc, subject_loc, evalue,
    parse_deflines, num_threads, word_size, gapopen, gapextend, reward,
    penalty, strand, no_dust, no_soft_masking, lcase_masking,
    perc_identity, template_type, template_length, no_greedy):

    cmd, stdin = _get_common_blast_options('blastn', query, db, subject,
                        query_loc, subject_loc, evalue, parse_deflines, num_threads)

    # process task
    cmd.extend(['-task', task])

    # word size
    if not isinstance(word_size, int): raise TypeError('`word_size` must be an integer')
    if task == 'megablast' and word_size < 4: raise ValueError('invalid value for `word_size`: must be >= 4 for megablast')
    if task == 'dc-megablast' and word_size not in [11, 12]: raise ValueError('invalid value for `word_size`: must be 11 or 12 for dc-megablast')
    if task == 'blastn' and word_size < 4: raise ValueError('invalid value for `word_size`: must be >= 4 for blastn')
    if task == 'blastn-short' and word_size < 4: raise ValueError('invalid value for `word_size`: must be >= 4 for blastn-short')
    cmd.extend(['-word_size', str(word_size)])

    # reward/penalty
    if not isinstance(reward, int): raise TypeError('`reward` must be an integer')
    if reward < 0: raise ValueError('`reward` must be >= 0')
    if not isinstance(penalty, int): raise TypeError('`penalty` must be an integer')
    if penalty > 0: raise ValueError('`penalty` must be <= 0')
    if (reward, penalty) not in _blastn_costs:
        raise ValueError('invalid set of values for reward/penalty: ({0},{1})'.format(reward, penalty))

    # gap costs
    if gapopen is None:
        if gapextend is not None: raise ValueError('if `gapopen` is ``None``, `gapextend` should be also')
        gapopen, gapextend = _blastn_costs[(reward, penalty)][0]
    elif gapextend is None:
        raise ValueError('if `gapextend` is ``None``, `gapopen` should be also')

    if not isinstance(gapopen, int): raise TypeError('`gapopen` must be an integer')
    if gapopen < 0: raise ValueError('`gapopen` must be >= 0')
    if not isinstance(gapextend, int): raise TypeError('`gapextend` must be an integer')
    if gapextend < 0: raise ValueError('`gapextend` must be >= 0')
    if (gapopen, gapextend) not in _blastn_costs[(reward, penalty)][1]:
        raise ValueError('invalid set of values for gapopen/gapextend: ({0},{1})'.format(gapopen, gapextend))
    cmd.extend(['-gapopen', str(gapopen), '-gapextend', str(gapextend),
                '-reward', str(reward), '-penalty', str(penalty)])

    # strand
    if not isinstance(strand, str): raise TypeError('`strand` must be a string')
    if strand not in ['both', 'minus', 'plus']: raise ValueError('`strand` must be either "both", "minus", or "plus"')
    cmd.extend(['-strand', strand])

    # filter/mask flags
    if no_dust: cmd.extend(['-dust', 'no'])
    if no_soft_masking: cmd.extend(['-soft_masking', 'false'])
    if lcase_masking: cmd.append('-lcase_masking')

    #  perc_identity
    if not isinstance(perc_identity, int): raise TypeError('`perc_identity` must be an integer')
    if perc_identity < 0 or perc_identity > 100: raise ValueError('`perc_identity` out of range')
    cmd.extend(['-perc_identity', str(perc_identity)])

    # template parameters
    if task == 'dc-megablast':
        if not isinstance(template_type, str): raise TypeError('`template_type` must be a string')
        if template_type not in ['coding', 'optimal', 'coding_and_optimal']: raise ValueError('invalid value for `template_type`')
        if not isinstance(template_length, int): raise TypeError('`template_length` must be an integer')
        if template_length not in [16, 18, 21]: raise ValueError('invalid value for `template_length`')
        cmd.extend(['-template_type', template_type])
        cmd.extend(['-template_length', str(template_length)])

    # aligment flags
    if no_greedy: cmd.append('-no_greedy')

    # run the program
    p = subprocess.Popen(cmd, stdin=None if stdin is None else subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    stdout, stderr = p.communicate(stdin)
    if p.returncode != 0:
        raise RuntimeError('error while running blastn (message below)\n' + stderr)
    return _parse_blast_output(stdout)


[docs]
def blastp(query, db=None, subject=None, query_loc=None, subject_loc=None,
        evalue=10, parse_deflines=False, num_threads=1, word_size=None,
        gapopen=None, gapextend=None, matrix='BLOSUM62', threshold=11,
        comp_based_stats=2, seg=0, soft_masking=False,
        lcase_masking=False, window_size=40, use_sw_tback=False):
    """
    ``bastp`` similary search. This is designed for using a protein query
     on a protein database.

    .. include:: blast_options_common
    .. include:: blast_options_prot

    :param parse_deflines: parse query and subject bar delimited
        sequence identifiers.
    :param comp_based_stats: composition-based statistics, as an integer
        code: 0 (no composition-based statistics), 1 (composition-based
        statistics as in NAR 29:2994-3005, 2001), 2 (composition-based
        score adjustment as in Bioinformatics 21:902-911, 2005,
        conditioned on sequence properties), or 3 (composition-based
        score adjustment as in Bioinformatics 21:902-911, 2005,
        unconditionally).
    :param soft_masking: apply filtering locations as soft masks.
    :param lcase_masking: use lower case filtering in query and subject
        sequences.
    :param use_sw_tback: compute locally optimal Smith-Waterman
        alignments.

    :return: A :class:`BlastOutput` instance.
    """
    return _blastp(query, db, subject, 'blastp', query_loc, subject_loc,
        evalue, parse_deflines, num_threads, word_size, gapopen,
        gapextend, matrix, threshold, comp_based_stats, seg,
        soft_masking, lcase_masking, window_size, use_sw_tback)



[docs]
def blastp_short(query, db=None, subject=None, query_loc=None,
        subject_loc=None, evalue=10, parse_deflines=False,
        num_threads=1, word_size=None, gapopen=None, gapextend=None,
        matrix='PAM30', threshold=16, comp_based_stats=0, seg=0,
        lcase_masking=False, window_size=15, use_sw_tback=False):
    """
    ``blastp`` similarity search for short sequences.

    .. include:: blast_options_common
    .. include:: blast_options_prot

    :param parse_deflines: parse query and subject bar delimited
        sequence identifiers.
    :param comp_based_stats: composition-based statistics, as an integer
        code: 0 (no composition-based statistics), 1 (composition-based
        statistics as in NAR 29:2994-3005, 2001), 2 (composition-based
        score adjustment as in Bioinformatics 21:902-911, 2005,
        conditioned on sequence properties), or 3 (composition-based
        score adjustment as in Bioinformatics 21:902-911, 2005,
        unconditionally).
    :param lcase_masking: use lower case filtering in query and subject
        sequences.
    :param use_sw_tback: compute locally optimal Smith-Waterman
        alignments.

    :return: A :class:`BlastOutput` instance.
    """
    return _blastp(query, db, subject, 'blastp-short', query_loc, subject_loc,
        evalue, parse_deflines, num_threads, word_size, gapopen,
        gapextend, matrix, threshold, comp_based_stats, seg, False,
        lcase_masking, window_size, use_sw_tback)



[docs]
def blastp_fast(query, db=None, subject=None, query_loc=None,
        subject_loc=None, evalue=10, parse_deflines=False,
        num_threads=1, word_size=None, threshold=21, comp_based_stats=2,
        seg=0, lcase_masking=False, window_size=40, use_sw_tback=False):
    """
    Quick ``blastp`` similarity search.

    .. include:: blast_options_common

    :param threshold: minimum word score such that the word is added to
        the BLAST lookup table (>0).
    :param seg: filter query sequence with SEG as an integer (0 to
        disable, 1 to enable, or alternatively a tuple with the three
        parameters *window*, *locut*, and *hicut*).
    :param window_size: multiple hits window size (use 0 to specify
        1-hit algorithm).
    :param parse_deflines: parse query and subject bar delimited
        sequence identifiers.
    :param comp_based_stats: composition-based statistics, as an integer
        code: 0 (no composition-based statistics), 1 (composition-based
        statistics as in NAR 29:2994-3005, 2001), 2 (composition-based
        score adjustment as in Bioinformatics 21:902-911, 2005,
        conditioned on sequence properties), or 3 (composition-based
        score adjustment as in Bioinformatics 21:902-911, 2005,
        unconditionally).
    :param lcase_masking: use lower case filtering in query and subject
        sequences.
    :param use_sw_tback: compute locally optimal Smith-Waterman
        alignments.

    :return: A :class:`BlastOutput` instance.
    """
    return _blastp(query, db, subject, 'blastp-fast', query_loc, subject_loc,
        evalue, parse_deflines, num_threads, word_size, None,
        None, None, threshold, comp_based_stats, seg, False,
        lcase_masking, window_size, use_sw_tback)


@_utils._protect_run
def _blastp(query, db, subject, task, query_loc, subject_loc, evalue,
    parse_deflines, num_threads, word_size, gapopen, gapextend, matrix,
    threshold, comp_based_stats, seg, soft_masking, lcase_masking,
    window_size, use_sw_tback):

    cmd, stdin = _get_common_blast_options('blastp', query, db, subject,
                        query_loc, subject_loc, evalue, parse_deflines, num_threads)
    _get_common_protein_options(cmd, task, word_size, matrix, threshold,
        gapopen, gapextend, seg, window_size, comp_based_stats)

    # process task
    cmd.extend(['-task', task])

    # filter/mask flags
    if task == 'blastp':
        if soft_masking: cmd.extend(['-soft_masking', 'true'])
        else: cmd.extend(['-soft_masking', 'false'])
    if lcase_masking: cmd.append('-lcase_masking')

    # use_sw_tback
    if use_sw_tback:
        cmd.append('-use_sw_tback')

    # run the program
    p = subprocess.Popen(cmd, stdin=None if stdin is None else subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    stdout, stderr = p.communicate(stdin)
    if  p.returncode != 0:
        raise RuntimeError('error while running blastp (message below)\n' + stderr)
    return _parse_blast_output(stdout)


[docs]
def blastx(query, db=None, subject=None, query_loc=None, subject_loc=None,
        evalue=10, num_threads=1, word_size=None, gapopen=None,
        gapextend=None, matrix='BLOSUM62', threshold=12,
        seg=(12, 2.2, 2.5), soft_masking=False, lcase_masking=False,
        window_size=40, strand='both', query_genetic_code=1,
        max_intron_length=0, comp_based_stats=2):
    """
    ``blastx`` similarity search. Designed for using a translated nucleotide
    query on a protein database.

    .. include:: blast_options_common
    .. include:: blast_options_prot
    :param soft_masking: apply filtering locations as soft masks.
    :param lcase_masking: use lower case filtering in query and subject
        sequences.
    :param query_genetic_code: genetic code to translate query. Allowed
        values are: 1-6, 9-16, 21-25.
    :param max_intron_length: length of the largest intron allowed in a
        translated nucleotide sequence when linking multiple distinct
        alignments (a negative value disables linking).
    :param strand: query strand(s) to search against database/subject.
        Choice of both, minus, or plus.
    :param comp_based_stats: composition-based statistics, as an integer
        code: 0 (no composition-based statistics), 1 (composition-based
        statistics as in NAR 29:2994-3005, 2001), 2 (composition-based
        score adjustment as in Bioinformatics 21:902-911, 2005,
        conditioned on sequence properties), or 3 (composition-based
        score adjustment as in Bioinformatics 21:902-911, 2005,
        unconditionally).

    :return: A :class:`BlastOutput` instance.

"""
    return _blastx('blastx', query, db, subject, query_loc, subject_loc,
        evalue, False, num_threads, word_size, gapopen,
        gapextend, matrix, threshold, seg, soft_masking, lcase_masking,
        window_size, strand, query_genetic_code, max_intron_length,
        comp_based_stats)



[docs]
def blastx_fast(query, db=None, subject=None, query_loc=None,
        subject_loc=None, evalue=10,  num_threads=1, word_size=None,
        gapopen=None, gapextend=None, matrix='BLOSUM62', threshold=21,
        seg=(12, 2.2, 2.5), soft_masking=False, lcase_masking=False,
        window_size=40, strand='both', query_genetic_code=1,
        max_intron_length=0, comp_based_stats=2):
    """
    Quick ``blastx`` similarity search. Designed for using a translated nucleotide
    query on a protein database and optimised for faster execution.

    .. include:: blast_options_common
    .. include:: blast_options_prot
    :param soft_masking: apply filtering locations as soft masks.
    :param lcase_masking: use lower case filtering in query and subject
        sequences.
    :param query_genetic_code: genetic code to translate query. Allowed
        values are: 1-6, 9-16, 21-25.
    :param max_intron_length: length of the largest intron allowed in a
        translated nucleotide sequence when linking multiple distinct
        alignments (a negative value disables linking).
    :param strand: query strand(s) to search against database/subject.
        Choice of both, minus, or plus.
    :param comp_based_stats: composition-based statistics, as an integer
        code: 0 (no composition-based statistics), 1 (composition-based
        statistics as in NAR 29:2994-3005, 2001), 2 (composition-based
        score adjustment as in Bioinformatics 21:902-911, 2005,
        conditioned on sequence properties), or 3 (composition-based
        score adjustment as in Bioinformatics 21:902-911, 2005,
        unconditionally).

    :return: A :class:`BlastOutput` instance.
    """
    return _blastx('blastx-fast', query, db, subject, query_loc,
        subject_loc, evalue, False, num_threads, word_size,
        gapopen, gapextend, matrix, threshold, seg, soft_masking,
        lcase_masking, window_size, strand, query_genetic_code,
        max_intron_length, comp_based_stats)


@_utils._protect_run
def _blastx(task, query, db, subject, query_loc,
        subject_loc, evalue, parse_deflines, num_threads, word_size,
        gapopen, gapextend, matrix, threshold, seg, soft_masking,
        lcase_masking, window_size, strand, query_genetic_code,
        max_intron_length, comp_based_stats):

    cmd, stdin = _get_common_blast_options('blastx', query, db, subject,
        query_loc, subject_loc, evalue, parse_deflines, num_threads)
    cmd.extend(['-task', task])

    _get_common_protein_options(cmd, task, word_size, matrix, threshold,
        gapopen, gapextend, seg, window_size, comp_based_stats)

    if soft_masking: cmd.extend(['-soft_masking', 'true'])
    else: cmd.extend(['-soft_masking', 'false'])
    if lcase_masking: cmd.append('-lcase_masking')

    if not isinstance(strand, str): raise TypeError('`strand` must be a string')
    if strand not in ['both', 'minus', 'plus']: raise ValueError('`strand` must be either "both", "minus", or "plus"')
    cmd.extend(['-strand', strand])

    if query_genetic_code not in [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25]:
        raise ValueError('invalid genetic code: {0}'.format(query_genetic_code))
    cmd.extend(['-query_gencode', str(query_genetic_code)])

    if not isinstance(max_intron_length, int): raise TypeError('`max_intron_length` must be an integer')
    if max_intron_length < 0: raise ValueError('`max_intron_length` must be >= 0')
    cmd.extend(['-max_intron_length', str(max_intron_length)])

    # run the program
    p = subprocess.Popen(cmd, stdin=None if stdin is None else subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    stdout, stderr = p.communicate(stdin)
    if p.returncode != 0:
        raise RuntimeError('error while running blastx (message below)\n' + stderr)
    return _parse_blast_output(stdout)


[docs]
def tblastn(query, db=None, subject=None, query_loc=None, subject_loc=None,
        evalue=10, num_threads=1, word_size=None, gapopen=None,
        gapextend=None, matrix='BLOSUM62', threshold=13,
        seg=(12, 2.2, 2.5), soft_masking=False, window_size=40,
        db_genetic_code=1, max_intron_length=0, comp_based_stats=2):
    """
    ``tblastn`` similary search. Designed for using a protein query on
    a translated nucleotide database.

    .. include:: blast_options_common
    .. include:: blast_options_prot

    :param soft_masking: apply filtering locations as soft masks.
    :param db_genetic_code: genetic code to translate subject sequences.
        Allowed values are: 1-6, 9-16, 21-25.
    :param max_intron_length: length of the largest intron allowed in a
        translated nucleotide sequence when linking multiple distinct
        alignments (a negative value disables linking).
    :param comp_based_stats: composition-based statistics, as an integer
        code: 0 (no composition-based statistics), 1 (composition-based
        statistics as in NAR 29:2994-3005, 2001), 2 (composition-based
        score adjustment as in Bioinformatics 21:902-911, 2005,
        conditioned on sequence properties), or 3 (composition-based
        score adjustment as in Bioinformatics 21:902-911, 2005,
        unconditionally).

    :return: A :class:`BlastOutput` instance.
    """
    return _tblastn('tblastn', query, db, subject, query_loc, subject_loc,
        evalue, num_threads, word_size, gapopen, gapextend, matrix,
        threshold, seg, soft_masking, window_size, db_genetic_code,
        max_intron_length, comp_based_stats)



[docs]
def tblastn_fast(query, db=None, subject=None, query_loc=None, subject_loc=None,
        evalue=10, num_threads=1, word_size=None, gapopen=None,
        gapextend=None, matrix='BLOSUM62', threshold=21,
        seg=(12, 2.2, 2.5), soft_masking=False, window_size=40,
        db_genetic_code=1, max_intron_length=0, comp_based_stats=2):
    """
    Quich ``tblastn`` similary search. Designed for using a protein query on
    a translated nucleotide database and optimised for fast execution.

    .. include:: blast_options_common
    .. include:: blast_options_prot

    :param soft_masking: apply filtering locations as soft masks.
    :param db_genetic_code: genetic code to translate subject sequences.
        Allowed values are: 1-6, 9-16, 21-25.
    :param max_intron_length: length of the largest intron allowed in a
        translated nucleotide sequence when linking multiple distinct
        alignments (a negative value disables linking).
    :param comp_based_stats: composition-based statistics, as an integer
        code: 0 (no composition-based statistics), 1 (composition-based
        statistics as in NAR 29:2994-3005, 2001), 2 (composition-based
        score adjustment as in Bioinformatics 21:902-911, 2005,
        conditioned on sequence properties), or 3 (composition-based
        score adjustment as in Bioinformatics 21:902-911, 2005,
        unconditionally).

    :return: A :class:`BlastOutput` instance.
    """
    return _tblastn('tblastn-fast', query, db, subject, query_loc, subject_loc,
        evalue, num_threads, word_size, gapopen, gapextend, matrix,
        threshold, seg, soft_masking, window_size, db_genetic_code,
        max_intron_length, comp_based_stats)


@_utils._protect_run
def _tblastn(task, query, db, subject, query_loc, subject_loc,
        evalue, num_threads, word_size, gapopen, gapextend, matrix,
        threshold, seg, soft_masking, window_size, db_genetic_code,
        max_intron_length, comp_based_stats):

    # process common options
    cmd, stdin = _get_common_blast_options('tblastn', query, db, subject,
                        query_loc, subject_loc, evalue, False, num_threads)
    _get_common_protein_options(cmd, task, word_size, matrix, threshold,
        gapopen, gapextend, seg, window_size, comp_based_stats)

    # process task
    cmd.extend(['-task', task])

    # filter/mask flags
    if soft_masking: cmd.extend(['-soft_masking', 'true'])
    else: cmd.extend(['-soft_masking', 'false'])

    # db_genetic_code
    if db_genetic_code not in [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25]:
        raise ValueError('invalid genetic code: {0}'.format(db_genetic_code))
    cmd.extend(['-db_gencode', str(db_genetic_code)])

    # max_intron_length
    if not isinstance(max_intron_length, int): raise TypeError('`max_intron_length` must be an integer')
    if max_intron_length < 0: raise ValueError('`max_intron_length` must be >= 0')
    cmd.extend(['-max_intron_length', str(max_intron_length)])

    # run the program
    p = subprocess.Popen(cmd, stdin=None if stdin is None else subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    stdout, stderr = p.communicate(stdin)
    if p.returncode != 0:
        raise RuntimeError('error while running tblastn (message below)\n' + stderr)
    return _parse_blast_output(stdout)

@_utils._protect_run
def tblastx(query, db=None, subject=None, query_loc=None, subject_loc=None,
        evalue=10, num_threads=1, word_size=None, matrix='BLOSUM62',
        seg=(12, 2.2, 2.5), soft_masking=False, lcase_masking=False,
        window_size=40, db_genetic_code=1, query_genetic_code=1,
        strand='both'):
    """
    ``tblastx`` similary search. Designed for using a translated
    nucleotide query on a translated nucleotide database.

    .. include:: blast_options_common

    :param matrix: scoring matrix name. Available values are: PAM-30,
        PAM-70, BLOSUM-80, and BLOSUM-62.
    :param seg: filter query sequence with SEG as an integer (0 to
        disable, 1 to enable, or alternatively a tuple with the three
        parameters *window*, *locut*, and *hicut*).
    :param window_size: multiple hits window size (use 0 to specify
        1-hit algorithm).
    :param soft_masking: apply filtering locations as soft masks.
    :param lcase_masking: use lower case filtering in query and subject
        sequences.
    :param query_genetic_code: genetic code to translate query. Allowed
        values are: 1-6, 9-16, 21-25.
    :param db_genetic_code: genetic code to translate subject sequences.
        Allowed values are: 1-6, 9-16, 21-25.
    :param strand: query strand(s) to search against database/subject.
        Choice of both, minus, or plus.

    :return: A :class:`BlastOutput` instance.
    """

    # process common options
    cmd, stdin = _get_common_blast_options('tblastx', query, db, subject,
                        query_loc, subject_loc, evalue, False, num_threads)
    _get_common_protein_options(cmd, 'tblastx', word_size, matrix, None,
        None, None, seg, window_size, None)

    # filter/mask flags
    if soft_masking: cmd.extend(['-soft_masking', 'true'])
    else: cmd.extend(['-soft_masking', 'false'])
    if lcase_masking: cmd.append('-lcase_masking')

    # query_genetic_code
    if query_genetic_code not in [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25]:
        raise ValueError('invalid genetic code: {0}'.format(query_genetic_code))
    cmd.extend(['-query_gencode', str(query_genetic_code)])

    # db_genetic_code
    if db_genetic_code not in [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25]:
        raise ValueError('invalid genetic code: {0}'.format(db_genetic_code))
    cmd.extend(['-db_gencode', str(db_genetic_code)])

    # strand
    if not isinstance(strand, str): raise TypeError('`strand` must be a string')
    if strand not in ['both', 'minus', 'plus']: raise ValueError('`strand` must be either "both", "minus", or "plus"')
    cmd.extend(['-strand', strand])

    # run the program
    p = subprocess.Popen(cmd, stdin=None if stdin is None else subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    stdout, stderr = p.communicate(stdin)
    if p.returncode != 0:
        raise RuntimeError('error while running tblastx (message below)\n' + stderr)
    return _parse_blast_output(stdout)

def _get_common_blast_options(command, query, db, subject, query_loc,
                              subject_loc, evalue, parse_deflines,
                              num_threads):
    # get command name
    path = _blast_apps[command].get_path()
    if path is None:
        raise RuntimeError('{0} program not available -- please configure path'.format(command))
    cmd = [path, '-query', '-', '-outfmt', '5']

    # process query
    if isinstance(query, str):
        stdin = '>lcl|Query\n' + query + '\n'
        queryL = len(query)
    elif isinstance(query, _interface.SequenceView):
        if command in ['blastn', 'blastx', 'tblastx'] and query._parent._alphabet != alphabets.DNA: raise ValueError('query alphabet must be DNA')
        if command in ['blastp', 'tblastn'] and query._parent._alphabet != alphabets.protein: raise ValueError('query alphabet must be protein')
        stdin = '>lcl|Query\n' + query.string() + '\n'
        queryL = len(query)
    elif isinstance(query, _interface.SampleView):
        if command in ['blastn', 'blastx', 'tblastx'] and query._parent._alphabet != alphabets.DNA: raise ValueError('query alphabet must be DNA')
        if command in ['blastp', 'tblastn'] and query._parent._alphabet != alphabets.protein: raise ValueError('query alphabet must be protein')
        stdin = '>lcl|{0}\n{1}\n'.format(query.name, query.sequence.string())
        queryL = len(query.sequence)
    elif isinstance(query, (_interface.Align, _interface.Container)):
        if command in ['blastn', 'blastx', 'tblastx'] and query._alphabet != alphabets.DNA: raise ValueError('query alphabet must be DNA')
        if command in ['blastp', 'tblastn'] and query._alphabet != alphabets.protein: raise ValueError('query alphabet must be protein')
        stdin = query.fasta()
    else: raise TypeError('`query` must be an Align, a Container, a SampleView, a SequenceView, or a string')

    # process db/subject
    if db is not None and subject is not None:
        raise ValueError('only one of `db` and `subject` must be specified')
    if db is None and subject is None:
        raise ValueError('either `db` or `subject` must be specified')
    elif db is not None:
        if not isinstance(db, str): raise ValueError('`db` must be a string')
        if not os.path.isabs(db):
            db = os.path.normpath(os.path.join(_utils._protect_path_mapping[os.getcwd()], db))
        cmd.extend(['-db', db])
    else:
        f = open('in.fas', 'w')
        if isinstance(subject, str):
            f.write('>lcl|Subject\n{0}\n'.format(subject))
            subjectL = len(subject)
        elif isinstance(subject, _interface.SequenceView):
            if command in ['blastn', 'tblastn', 'tblastx'] and subject._parent._alphabet != alphabets.DNA: raise ValueError('subject alphabet must be DNA')
            if command in ['blastp', 'blastx'] and subject._parent._alphabet != alphabets.protein: raise ValueError('subject alphabet must be protein')
            f.write('>lcl|Subject\n{0}\n'.format(subject.string()))
            subjectL = len(subject)
        elif isinstance(subject, _interface.SampleView):
            if command in ['blastn', 'tblastn', 'tblastx'] and subject._parent._alphabet != alphabets.DNA: raise ValueError('subject alphabet must be DNA')
            if command in ['blastp', 'blastx'] and subject._parent._alphabet != alphabets.protein: raise ValueError('subject alphabet must be protein')
            f.write('>lcl|{0}\n{1}\n'.format(subject.name, subject.sequence.string()))
            subjectL = len(subject.sequence)
        else: raise TypeError('`subject` must be a SampleView, a SequenceView, or a string')
        cmd.extend(['-subject', 'in.fas'])

    # process query_loc
    if query_loc is not None:
        if isinstance(query, _interface.Container): raise ValueError('`query_loc` is not supported if `query` is a Container')
        if not isinstance(query_loc, (list, tuple)): raise TypeError('`query_loc` must be a list or a tuple')
        if len(query_loc) != 2: raise ValueError('`query_loc` must have two items')
        if not isinstance(query_loc[0], int) or not isinstance(query_loc[1], int): raise TypeError('`query_loc` must contain two integers')
        if query_loc[0] >= queryL or query_loc[0] < 0 or query_loc[1] < query_loc[0] + 1: raise ValueError('invalid values for `query_loc`')
        cmd.extend(['-query_loc', '{0}-{1}'.format(query_loc[0]+1, query_loc[1])])

    # process subject_loc
    if subject_loc is not None:
        if subject is None: raise ValueError('`subject_loc` cannot be specified if `subject` is not specified')
        if not isinstance(subject_loc, (list, tuple)): raise TypeError('`subject_loc` must be a list or a tuple')
        if len(subject_loc) != 2: raise ValueError('`subject_loc` must have two items')
        if not isinstance(subject_loc[0], int) or not isinstance(subject_loc[1], int): raise TypeError('`subject_loc` must contain two integers')
        if subject_loc[0] >= subjectL or subject_loc[0] < 0 or subject_loc[1] < subject_loc[0] + 1: raise ValueError('invalid values for `subject_loc`')
        cmd.extend(['-subject_loc', '{0}-{1}'.format(subject_loc[0]+1, subject_loc[1])])

    # process evalue
    if evalue is not None:
        if not isinstance(evalue, (int, float)): raise TypeError('`evalue` must be a real value')
        if evalue <= 0: raise ValueError('`evalue` must be > 0')
        cmd.extend(['-evalue', str(evalue)])

    # parse_deflines
    if parse_deflines: cmd.append('-parse_deflines')

    # num_threads
    if not isinstance(num_threads, int): raise TypeError('`num_threads` must be an integer')
    if num_threads < 1: raise ValueError('`num_threads` must be >= 1')
    if num_threads > 1 and subject is not None: raise ValueError('`num_threads cannot be > 1 if `subject` is used')
    cmd.extend(['-num_threads', str(num_threads)])

    return cmd, stdin

def _get_common_protein_options(cmd, task, word_size, matrix, threshold,
        gapopen, gapextend, seg, window_size, comp_based_stats):

    # word size
    if word_size is not None:
        if not isinstance(word_size, int): raise TypeError('`word_size` must be an integer')
        if word_size < 2 or word_size > 7: raise ValueError('invalid value for `word_size`: must be in range [2,7]')
        cmd.extend(['-word_size', str(word_size)])

    # matrix
    if matrix is not None:
        if matrix not in ['PAM30', 'PAM70', 'BLOSUM80', 'BLOSUM62']:
            raise ValueError('invalid value for `matrix`: {0}'.format(matrix))
        cmd.extend(['-matrix', matrix])

    # gap costs
    if task not in ['blastp-fast', 'tblastx']:
        if gapopen is None:
            if gapextend is not None: raise ValueError('if `gapopen is ``None``, `gapextend` must be also')
            gapopen, gapextend = _blastp_costs_defaults[matrix]
        else:
            if gapextend is None: raise ValueError('if `gapextend is ``None``, `gapopen` must be also')
            if not isinstance(gapopen, int): raise TypeError('`gapopen` must be an integer')
            if not isinstance(gapextend, int): raise TypeError('`gapextend` must be an integer')
            if (gapopen, gapextend) not in _blastp_costs[matrix]: raise ValueError('invalid gapopen/gapextend combination for {0}'.format(matrix))
        cmd.extend(['-gapopen', str(gapopen), '-gapextend', str(gapextend)])

    # threshold
    if threshold is not None:
        if not isinstance(threshold, int): raise TypeError('`threshold` must be an integer')
        if threshold <= 0: raise ValueError('invalid value for `treshold`: must be > 0')
        cmd.extend(['-threshold', str(threshold)])

    # seg
    if seg is not None:
        if seg == 0: seg = 'no'
        elif seg == 1: seg = 'yes'
        elif not isinstance(seg, (tuple, list)):
            raise ValueError('invalid value for `seg`: must be 0, 1, or a tuple/list of 3 items')
        else:
            try:
                window, locut, hicut = seg
                window = int(window)
            except ValueError:
                raise ValueError('invalid value for `seg`')
            if window < 1: raise ValueError('`window` (seg parameter) must be > 0')
            locut = float(locut)
            if locut <= 0: raise ValueError('`locut` (seg parameter) must be > 0')
            hicut = float(hicut)
            if hicut <= locut: raise ValueError('`hicut` (seg parameter) must be > `locut`')
            seg = '{0} {1} {2}'.format(window, locut, hicut)
        cmd.extend(['-seg', seg])

    # window_size
    if window_size is not None:
        if not isinstance(window_size, int): raise TypeError('`window_size` must be an integer')
        if window_size < 0: raise ValueError('invalid value for `window_size`: {0}'.format(window_size))
        cmd.extend(['-window_size', str(window_size)])

    # comp_based_stats
    if comp_based_stats is not None:
        if comp_based_stats not in [0, 1, 2, 3]:
            raise ValueError('invalid value for `comp_based_stats`')
        cmd.extend(['-comp_based_stats', str(comp_based_stats)])

def _xml_find(node, tag):
    res = node.find(tag)
    if res is None: raise ValueError('invalid format of BLAST XML output: cannot find `{0}` under `{1}`'.format(tag, node.tag))
    return res

def _xml_find_type(node, tag, type_):
    value = _xml_find(node, tag).text
    try: return type_(value)
    except ValueError:
        raise ValueError('invalid format of BLAST XML output: invalid value for `{0}`: {1}'.format(tag, value))

def _parse_blast_output(string):
    root = xml.etree.ElementTree.fromstring(string)
    if root.tag != 'BlastOutput': raise ValueError('invalid format of BLAST XML output: no `BlastOutput` at root')

    out = BlastOutput()
    out._program = _xml_find(root, 'BlastOutput_program').text
    out._version = _xml_find(root, 'BlastOutput_version').text
    out._reference = _xml_find(root, 'BlastOutput_reference').text
    out._db = _xml_find(root, 'BlastOutput_db').text
    out._query_ID = _xml_find(root, 'BlastOutput_query-ID').text
    out._query_def = _xml_find(root, 'BlastOutput_query-def').text
    out._query_len = _xml_find_type(root, 'BlastOutput_query-len',  int)

    node = _xml_find(root, 'BlastOutput_param')
    nodes = node.findall('Parameters')
    if len(nodes) != 1: raise ValueError('invalid format of BLAST XML output: expect one set of BLAST parameters')
    node = nodes[0]

    out._params = {}
    for child in node:
        if child.tag[:11] != 'Parameters_':
            raise ValueError('invalid parameter in BLAST output: {0}'.format(child.tag))
        tag = child.tag[11:]
        if tag in out._params:
            raise ValueError('parameter repeated in BLAST output: {0}'.format(child.tag))
        try: out._params[tag] = int(child.text)
        except ValueError:
            try: out._params[tag] = float(child.text)
            except ValueError:
                out._params[tag] = child.text

    iterations = _xml_find(root, 'BlastOutput_iterations')
    for query_node in iterations:
        if query_node.tag != 'Iteration': raise ValueError('invalid format of BLAST XML output: unexpected node under `BlastOutput_iterations`: {0}'.format(query_node.tag))
        query_hits = BlastQueryHits()
        i = _xml_find_type(query_node, 'Iteration_iter-num', int)
        if i != len(out._query_hits) + 1: raise ValueError('invalid format of BLAST XML output: unexpected value for `Iteration_iter-num`: {0}'.format(i))
        query_hits._num = i - 1
        query_hits._query_ID = _xml_find(query_node, 'Iteration_query-ID').text
        query_hits._query_def = _xml_find(query_node, 'Iteration_query-def').text
        query_hits._query_len = _xml_find_type(query_node, 'Iteration_query-len', int)
        node = _xml_find(query_node, 'Iteration_stat')
        nodes = node.findall('Statistics')
        if len(nodes) != 1: raise ValueError('invalid format of BLAST XML output: expect one set of iteration statistics')
        node = nodes[0]
        query_hits._db_num = _xml_find_type(node, 'Statistics_db-num', int)
        query_hits._db_len = _xml_find_type(node, 'Statistics_db-len', int)
        query_hits._hsp_len = _xml_find_type(node, 'Statistics_hsp-len', int)
        query_hits._eff_space = _xml_find_type(node, 'Statistics_eff-space', int)
        query_hits._kappa = _xml_find_type(node, 'Statistics_kappa', float)
        query_hits._lambda = _xml_find_type(node, 'Statistics_lambda', float)
        query_hits._entropy = _xml_find_type(node, 'Statistics_entropy', float)
        out._query_hits.append(query_hits)

        hits = _xml_find(query_node, 'Iteration_hits')
        for hit_node in hits:
            if hit_node.tag != 'Hit': raise ValueError('invalid format of BLAST XML output: unexpected node under `Iteration_hits`: {0}'.format(hit_node.tag))
            hit = BlastHit()
            i = _xml_find_type(hit_node, 'Hit_num', int)
            if i != len(query_hits._hits) + 1: raise ValueError('invalid format of BLAST XML output: unexpected value for `Hit_num`: {0}'.format(i))
            hit._num = i - 1
            hit._id = _xml_find(hit_node, 'Hit_id').text
            hit._def = _xml_find(hit_node, 'Hit_def').text
            hit._accession = _xml_find(hit_node, 'Hit_accession').text
            hit._len = _xml_find_type(hit_node, 'Hit_len', int)
            query_hits._hits.append(hit)

            Hsps = _xml_find(hit_node, 'Hit_hsps')
            for Hsp_node in Hsps:
                if Hsp_node.tag != 'Hsp': raise ValueError('invalid format of BLAST XML output: unexpected node under `Hit_hsps`: {0}'.format(Hsp_node.tag))
                Hsp = BlastHsp()
                i = _xml_find_type(Hsp_node, 'Hsp_num', int)
                if i != len(hit._hsp) + 1: raise ValueError('invalid format of BLAST XML output: unexpected value for `Hsp_num`: {0}'.format(i))
                Hsp._num = i - 1
                Hsp._bit_score = _xml_find_type(Hsp_node, 'Hsp_bit-score', float)
                Hsp._evalue = _xml_find_type(Hsp_node, 'Hsp_evalue', float)
                Hsp._identity = _xml_find_type(Hsp_node, 'Hsp_identity', int)
                Hsp._gaps = _xml_find_type(Hsp_node, 'Hsp_gaps', int)
                Hsp._positive = _xml_find_type(Hsp_node, 'Hsp_positive', int)
                Hsp._align_len = _xml_find_type(Hsp_node, 'Hsp_align-len', int)
                Hsp._qseq = _xml_find(Hsp_node, 'Hsp_qseq').text
                Hsp._midline = _xml_find(Hsp_node, 'Hsp_midline').text
                Hsp._hseq = _xml_find(Hsp_node, 'Hsp_hseq').text
                if len(Hsp._qseq) != Hsp._align_len: raise ValueError('invalid format of BLAST XML output: incompatibility between `qseq` and `align-len`')
                if len(Hsp._hseq) != Hsp._align_len: raise ValueError('invalid format of BLAST XML output: incompatibility between `hseq` and `align-len`')
                if len(Hsp._midline) != Hsp._align_len: raise ValueError('invalid format of BLAST XML output: incompatibility between `midline` and `align-len`')

                qfrom = _xml_find_type(Hsp_node, 'Hsp_query-from', int)
                qto = _xml_find_type(Hsp_node, 'Hsp_query-to', int)
                Hsp._query_frame = _xml_find_type(Hsp_node, 'Hsp_query-frame', int)
                if out._program in ['blastp', 'tblastn']:
                    if Hsp._query_frame != 0:  raise ValueError('invalid value for `Hsp_query-frame`: {0}'.format(Hsp._query_frame))
                    if qto <= qfrom: raise ValueError('invalid values for `Hsp_query-from/to`: {0} and {1} with positive frame'.format(qfrom, qto))
                    Hsp._query_start = qfrom - 1
                    Hsp._query_stop = qto
                else:
                    if Hsp._query_frame not in [-3, -2, -1, 1, 2, 3]: raise ValueError('invalid value for `Hsp_query-frame`: {0}'.format(Hsp._query_frame))
                    qfrom, qto = sorted([qfrom, qto])
                    Hsp._query_start = qfrom - 1
                    Hsp._query_stop = qto

                hfrom = _xml_find_type(Hsp_node, 'Hsp_hit-from', int)
                hto = _xml_find_type(Hsp_node, 'Hsp_hit-to', int)
                Hsp._hit_frame = _xml_find_type(Hsp_node, 'Hsp_hit-frame', int)
                if out._program in ['blastp', 'blastx']:
                    if Hsp._hit_frame != 0:  raise ValueError('invalid value for `Hsp_hit-frame`: {0}'.format(Hsp._hit_frame))
                    if hto <= hfrom: raise ValueError('invalid values for `Hsp_hit-from/to`: {0} and {1} with positive frame'.format(hfrom, hto))
                    Hsp._hit_start = hfrom - 1
                    Hsp._hit_stop = hto
                else:
                    if Hsp._hit_frame not in [-3, -2, -1, 1, 2, 3]: raise ValueError('invalid value for `Hsp_hit-frame`: {0}'.format(Hsp._hit_frame))
                    hfrom, hto = sorted([hfrom, hto])
                    Hsp._hit_start = hfrom - 1
                    Hsp._hit_stop = hto

                hit._hsp.append(Hsp)
    return out


[docs]
class BlastOutput(object):
    """
    Full results of a BLAST run.
    """
    def __init__(self):
        self._program = None
        self._version = None
        self._reference = None
        self._db = None
        self._query_ID = None
        self._query_def = None
        self._query_len = None
        self._params = None
        self._query_hits = []

    @property
    def program(self):
        """ Name of the program used."""
        return self._program

    @property
    def version(self):
        """ Version of the program. """
        return self._version

    @property
    def reference(self):
        """ Bibliographic reference. """
        return self._reference

    @property
    def db(self):
        """ Name of the database used. """
        return self._db

    @property
    def query_ID(self):
        """ Identifier of the query. """
        return self._query_ID

    @property
    def query_def(self):
        """ Description of the query. """
        return self._query_def

    @property
    def query_len(self):
        """ Length of the query. """
        return self._query_len

    @property
    def params(self):
        """
        Search parameters.
        
        ``"expect"``: E-value, ``"reward"``:
        nucleotide match reward, ``"penalty"``: nucleotide
        mismatch reward, ``"gapopen"``: cost for opening a gap,
        ``"gapextend"``: cost for extending a gap.
        ``"filter"``: filter string.
        """
        return self._params

    @property
    def num_queries(self):
        """
        Number of queries used in the BLAST search. ``blast_output.num_queries``
        is also available as ``len(blast_output)``.
        """
        return len(self._query_hits)

    def __len__(self):
        return len(self._query_hits)


[docs]
    def iter_queries(self):
        """
        Iterator over queries. Allows to iterate over
        :class:`.BlastQueryHits` instances for all queries.
        ``for query_hit in blast_output.iter_queries()`` is
        also available as ``for query_hit in blast_output``.
        """
        return self.__iter__()


    def __iter__(self):
        for i in self._query_hits: yield i


[docs]
    def get_query(self, i):
        """
        Hits for a given query. An instance of :class:`.BlastQueryHits`
        is returned.
        ``blast_output.get_query(i)`` is also available as ``blast_output[i]``.
        """
        return self._query_hits[i]


    def __getitem__(self, i):
        return self._query_hits[i]

    @property
    def num_hits(self):
        """
        Total number of hits for all queries.
        """
        return sum(map(len, self._query_hits))


[docs]
    def iter_hits(self):
        """
        Iterator over all hits of all queries. Iterates over
        :class:`.BlastHit` instances
        """
        for query in self._query_hits:
            for hit in query:
                yield hit


    @property
    def num_hsp(self):
        """
        Total number of Hsp's for all hits of all entries.
        """
        return sum([hit.num_hsp() for hit in self._query_hits])


[docs]
    def iter_hsp(self):
        """
        Iterator over all Hsp's of all hits of all queries.
        """
        for query in self._query_hits:
            for hit in query:
                for Hsp in hit:
                    yield Hsp




[docs]
class BlastQueryHits(object):
    """
    Results for a given query of a BLAST run.
    """
    def __init__(self):
        self._num = None
        self._query_ID = None
        self._query_def = None
        self._query_len = None
        self._db_num = None
        self._db_len = None
        self._hsp_len = None
        self._eff_space = None
        self._kappa = None
        self._lambda = None
        self._entropy = None
        self._hits = []

    @property
    def num(self):
        """ index of the query in the BLAST run. """
        return self._num

    @property
    def query_ID(self):
        """Identifier of the query."""
        return self._query_ID

    @property
    def query_len(self):
        """Length of the query."""
        return self._query_len

    @property
    def query_def(self):
        """Description of the query."""
        return self._query_def

    @property
    def db_num(self):
        """Number of sequence in the database."""
        return self._db_num

    @property
    def db_len(self):
        """Number of letters in the database."""
        return self._db_len

    @property
    def hsp_len(self):
        """Length adjustment."""
        return self._hsp_len

    @property
    def eff_space(self):
        """Effective space of the search."""
        return self._eff_space

    @property
    def K(self):
        """Karlin-Altschul kappa parameter."""
        return self._kappa

    @property
    def L(self):
        """Karlin-Altschul lambda parameter."""
        return self._lambda

    @property
    def H(self):
        """Karlin-Altschul entropy parameter."""
        return self._entropy

    @property
    def num_hits(self):
        """
        Number of hits for this query. ``query_hits.num_hits()`` is also
        available as ``len(query_hits)``.
        """
        return len(self._hits)

    def __len__(self):
        return len(self._hits)


[docs]
    def iter_hits(self):
        """
        Iterator to the :class:`.BlastHit` instances of all
        hits. ``for hit in query_hits.iter_hits()`` is also available as
        ``for hit in query_hits``.
        """
        for i in self._hits: yield i


    def __iter__(self):
        for i in self._hits: yield i


[docs]
    def get_hit(self, i):
        """
        Get a given hit, as a :class:`.BlastHit` instance. ``query_hits.get_hit(i)``
        is also available as ``query_hits[i]``.
        """
        return self._hits[i];


    def __getitem__(self, i):
        return self._hits[i]

    @property
    def num_hsp(self):
        """
        Total number of Hsp's for all hits.
        """
        return sum(map(len, self._hits))


[docs]
    def iter_hsp(self):
        """
        Iterator over all Hsp's of all hits, as :class:`.BlastHsp`
        instances
        """
        for hit in self._hits:
            for hsp in hit:
                yield hsp




[docs]
class BlastHit(object):
    """
    Results for a given hit of a BLAST run.
    """
    def __init__(self):
        self._num = None
        self._id = None
        self._def = None
        self._accession = None
        self._len = None
        self._hsp = []

    @property
    def num(self):
        """Index of the hit for the corresponding query."""
        return self._num

    @property
    def id(self):
        """Identifier of the subject."""
        return self._id

    @property
    def descr(self):
        """Description of the subject."""
        return self._def

    @property
    def accession(self):
        """Identifier of the subject."""
        return self._accession

    @property
    def len(self):
        """Length of subject."""
        return self._len

    @property
    def num_hsp(self):
        """
        Number of Hsp's in this hit. ``hit.num_Hsp()  is also available as
        ``len(hit)``.
        """
        return len(self._hsp)

    def __len__(self):
        return len(self._hsp)


[docs]
    def iter_hsp(self):
        """
        Iterator to the :class:`.BlastHsp` instances for all
        Hsp's. ``for hsp in hit.iter_hsp()`` is also available as
        ``for hsp in hit``.
        """
        for i in self._hsp: yield i


    def __iter__(self):
        for i in self._hsp: yield i


[docs]
    def get_hsp(self, i):
        """
        Get a given Hsp, as a :class:`.BlastHsp` instance. ``hit.get_hsp(i)``
        is also available as ``hit[i]``.
        """
        return self._hsp[i];


    def __getitem__(self, i):
        return self._hsp[i]



[docs]
class BlastHsp(object):
    """
    Description of an Hsp of a BLAST run.

    Start and stop positions are always interpreted as range parameters (use
    `frame` to determine if the complement should be used)::

        >>> hit_sequence = seq[query_start:query_to]
    """
    def __init__(self):
        self._num = None
        self._bit_score = None
        self._evalue = None
        self._query_start = None
        self._query_stop = None
        self._query_frame = None
        self._hit_start = None
        self._hit_stop = None
        self._hit_frame = None
        self._identity = None
        self._positive = None
        self._gaps = None
        self._align_len = None
        self._qseq = None
        self._hseq = None
        self._midline = None

    @property
    def num(self):
        """Index of the Hsp in the corresponding hit."""
        return self._num

    @property
    def bit_score(self):
        """Bit score of the Hsp."""
        return self._bit_score

    @property
    def evalue(self):
        """Expectation value of the Hsp."""
        return self._evalue

    @property
    def query_start(self):
        """Start position on the query."""
        return self._query_start

    @property
    def query_stop(self):
        """Stop position on the query."""
        return self._query_stop

    @property
    def hit_start(self):
        """Start position on the subject."""
        return self._hit_start

    @property
    def hit_stop(self):
        """Stop position on the subject."""
        return self._hit_stop

    @property
    def query_frame(self):
        """Frame of the query."""
        return self._query_frame

    @property
    def hit_frame(self):
        """Frame of the hit."""
        return self._hit_frame

    @property
    def identity(self):
        """Number of identical positions."""
        return self._identity

    @property
    def positive(self):
        """Number of positions with positive score."""
        return self._positive

    @property
    def gaps(self):
        """Number of gap positions."""
        return self._gaps

    @property
    def align_len(self):
        """Length of the alignment."""
        return self._align_len

    @property
    def qseq(self):
        """Aligned query sequence."""
        return self._qseq

    @property
    def hseq(self):
        """Aligned subject sequence."""
        return self._hseq

    @property
    def midline(self):
        """Alignment midline."""
        return self._midline