Source code for egglib.wrappers._blast

"""
    Copyright 2009-2023 Stephane De Mita, Mathieu Siol, Thomas Coudoux

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

# cf: https://www.ncbi.nlm.nih.gov/books/NBK279684/
# see also: https://doctorlib.info/medical/blast/7.html (checked for strands)

import subprocess, os, collections, sys
import xml.etree.ElementTree
from .. import _interface, alphabets
from ..tools import _code_tools
from . import _utils

class _BLAST_tool(_utils._App):
    def _check_path(self, path, cfg):
        cmd = (path, '-version')
        try:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
            stdout, stderr = p.communicate()
            if len(stderr): return stderr
        except OSError as e:
            return e.strerror

_blast_apps = {}
for k in 'makeblastdb', 'blastn', 'blastp', 'blastx', 'tblastn', 'tblastx':
    _blast_apps[k] = _BLAST_tool(key=k, default=k)
    _utils.paths._add(_blast_apps[k])

[docs]def makeblastdb(source, dbtype=None, out=None, input_type="fasta", verbose=False, title=None, parse_seqids=False, hash_index=False, mask_data=None, mask_id=None, mask_desc=None, blastdb_version=5, max_file_sz="1GB", taxid=None, taxid_map=None): """ Create a BLAST database. :param source: name of an input file of the appropriate format. If not fasta, the format must be specified using the *input_type* option. Alternatively, *source* can be a :class:`.Container` or :class:`.Align` instance. If so, its alphabet must be DNA or protein and the *dbtype* argument, if specified, must match. Note that passing a :class:`!Container` or :class:`!Align` instance must be avoided for large databases. :param dbtype: database type: ``"nucl"`` or ``"prot"`` are acceptable. Can be omitted if a :class:`!Container` or an :class:`!Align` is provided as *source*. :param out: database name. Must be specified if a :class:`!Container` or an :class:`!Align` is provided as *source*, or if the *input_type* is "blastdb", otherwise the input file name is used as database name. :param input_type: format of input file. Must be ``"fasta"`` if a :class:`!Container` or an :class:`!Align` is provided as *source*. Otherwise must describe the format of *source*: ``"fasta"``, ``"asn1_bin"``, ``"asn1_txt"``, or ``"blastdb"``. :param verbose: display makeblastdb output (by default, it is returned by the function). Errors are always displayed. :param title: database title. A default title is inserted in case a :class:`!Container` or :class:`!Align` instance is passed as *source*. :parse_seqids: parse seqid from sequence names (considered if `input_type` is fasta, including if a :class:`!Container` or an :class:`!Align` is provided as *source*; argument ignored otherwise: seqid is always imported). :hash_index: create index of sequence hash values. :mask_data: list of input files containing masking data. :mask_id: list of strings to uniquely identify the masking algorithm, one for each mask file (requires *mask_data*). :mask_desc: list of free form strings to describe the masking algorithm details, one for each mask file (requires *mask_id*). :blastdb_version: version of BLAST database to be created (4 or 5). :max_file_sz: maximum file size for BLAST database files. :taxid: taxonomy ID to assign to all sequences as an integer (incompatible with *taxid_map*). :taxid_map: text file mapping sequence IDs to taxonomy IDs (requires *parse_seqids*, incompatible with *taxid*). :return: Standard output of the program (`None` if *verbose* was ``True``). Please refer to the manual of BLAST tools for more details. .. versionchanged 3.2:: *blastdb_version* default value was previously 4. *gi_mask* and *gi_mask_name* options are removed. """ # get command name path = _blast_apps['makeblastdb'].get_path() if path is None: raise RuntimeError('makeblastdb program not available -- please configure path') cmd = [path] # process main arguments arguments if isinstance(source, (_interface.Container, _interface.Align)): cmd.extend(['-in', '-']) if source.alphabet == alphabets.DNA and (dbtype is None or dbtype == 'nucl'): cmd.extend(['-dbtype', 'nucl']) elif source.alphabet == alphabets.protein and (dbtype is None or dbtype == 'prot'): cmd.extend(['-dbtype', 'prot']) else: raise ValueError('invalid alphabet, dbtype or alphabet/dbtype mismatch') if input_type != 'fasta': raise ValueError('`input_type` must be "fasta"') if out is None: raise ValueError('`out` is required if an object is passed as `source`') cmd.extend(['-input_type', 'fasta']) if title is None: title = '{0} database from an EggLib {1}'.format(cmd[4], 'Container' if isinstance(source, _interface.Container) else 'Align') else: if not isinstance(source, (os.PathLike, str)): raise TypeError('`source` must be a file name, a Container or an Align') if input_type != 'blastdb' and not os.path.isfile(source): raise ValueError('file not found: {0}'.format(source)) if dbtype is None: raise ValueError('`dbtype` is required') if dbtype not in ['nucl', 'prot']: raise ValueError('invalid value for `dbtype`') if input_type not in ['fasta', 'asn1_bin', 'asn1_txt', 'blastdb']: raise ValueError('invalid value for `input_type`') if input_type == 'blastdb' and out is None or out == source: raise ValueError('a different database name is required for input type `blastdb`') cmd.extend(['-in', source, '-dbtype', dbtype, '-input_type', input_type]) if out is not None: if not isinstance(out, (os.PathLike, str)): raise TypeError('`out` must be a string or a path-like object') cmd.extend(['-out', out]) # check types of other arguments if not isinstance(blastdb_version, int): raise TypeError('`blastdb_version` must be an integer') if blastdb_version not in [4, 5]: raise ValueError('supported values for `blastdb_version` are 4 and 5 only') if not isinstance(max_file_sz, str): raise TypeError('`max_file_sz` must be a string') if title is not None and not isinstance(title, str): raise TypeError('`title` must be a string') if not isinstance(parse_seqids, bool): raise TypeError('`parse_seqids` must be a boolean') if input_type != 'fasta': parse_seqids = True if not isinstance(hash_index, bool): raise TypeError('`hash_index` must be a boolean') if mask_data is not None: if not isinstance(mask_data, (list, tuple)): raise TypeError('`mask_data` must be a list or a tuple') if not all([isinstance(item, str) for item in mask_data]): raise TypeError('`mask_data` items must be strings') if not all(map(os.path.isfile, mask_data)): raise ValueError('at least one file specified in `mask_data` does not exist') if len(mask_data) == 0: raise ValueError('there must be at least one item in `mask_data`') if mask_id is not None: if not isinstance(mask_id, (list, tuple)): raise TypeError('`mask_id` must be a list or a tuple') if not all([isinstance(item, str) for item in mask_id]): raise TypeError('`mask_id` items must be strings') if mask_desc is not None: if not isinstance(mask_desc, (list, tuple)): raise TypeError('`mask_desc` must be a list or a tuple') if not all([isinstance(item, str) for item in mask_desc]): raise TypeError('`mask_desc` items must be strings') if taxid is not None: if not isinstance(taxid, int): raise TypeError('`taxid` must be an integer') if taxid < 0: raise ValueError('`taxid` must be >= 0') if taxid_map is not None: if not isinstance(taxid_map, (str, os.PathLike)): raise TypeError('`taxid_map` must be a string or pathlike object') if not os.path.isfile(taxid_map): raise ValueError('file passed for `taxid_map` not found: {0}'.format(taxid_map)) # check compatibility of other arguments if mask_id is not None: if mask_data is None: raise ValueError('`mask_id` requires `mask_data`') if len(mask_id) != len(mask_data): raise ValueError('`mask_id` must have the same length than `mask_data`') if mask_desc is not None: if mask_id is None: raise ValueError('`mask_desc` requires `mask_id`') if len(mask_desc) != len(mask_data): raise ValueError('`mask_desc` must have the same length than `mask_data`') if taxid is not None and taxid_map is not None: raise ValueError('`taxid` and `taxid_map` are incompatible') if taxid_map is not None and not parse_seqids: raise ValueError('`taxid_map` requires `parse_seqids`') # load the options if title is not None: cmd.extend(['-title', title]) if parse_seqids: cmd.append('-parse_seqids') if hash_index: cmd.append('-hash_index') if mask_data is not None: cmd.extend(['-mask_data', ','.join(mask_data)]) if mask_id is not None: cmd.extend(['-mask_id', ','.join(mask_id)]) if mask_desc is not None: cmd.extend(['-mask_desc', ','.join(mask_desc)]) cmd.extend(['-blastdb_version', str(blastdb_version), '-max_file_sz', max_file_sz]) if taxid: cmd.extend(['-taxid', str(taxid)]) if taxid_map: cmd.extend(['-taxid_map', taxid_map]) # run the program p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=None if verbose else subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) if isinstance(source, (_interface.Container, _interface.Align)): stdin = source.fasta() else: stdin = None stdout, stderr = p.communicate(stdin) if p.returncode != 0: raise RuntimeError('error while running makeblastdb (message below)\n' + stderr) return stdout
_blastn_costs = { # r/p default, allowed (1,-2): ( (5,2), [(5,2), (2,2), (1,2), (0,2), (3,1), (2,1), (1,1)] ), (1,-3): ( (5,2), [(5,2), (2,2), (1,2), (0,2), (2,1), (1,1)] ), (1,-4): ( (5,2), [(5,2), (1,2), (0,2), (2,1), (1,1)] ), (2,-3): ( (5,2), [(4,4), (2,4), (0,4), (3,3), (6,2), (5,2), (4,2), (2,2)] ), (4,-5): ( (12,8), [(12,8), (6,5), (5,5), (4,5), (3,5)] ), (1,-1): ( (5,2), [(5,2), (3,2), (2,2), (1,2), (0,2), (4,1), (3,1), (2,1)] )} _blastp_costs = { 'BLOSUM62': [(32767, 32767), (11, 2), (10, 2), (9, 2), (8, 2), (7, 2), (6, 2), (13, 1), (12, 1), (11, 1), (10, 1), (9, 1)], 'BLOSUM80': [(32767, 32767), (25, 2), (13, 2), (9, 2), (8, 2), (7, 2), (6, 2), (11, 1), (10, 1), (9, 1)], 'PAM30': [(32767, 32767), (7, 2), (6, 2), (5, 2), (10, 1), (9, 1), (8, 1), (15, 3), (14, 2), (14, 1), (13, 3)], 'PAM70': [(32767, 32767), (8, 2), (7, 2), (6, 2), (11, 1), (10, 1), (9, 1), (11, 2), (12, 3)] } _blastp_costs_defaults = { 'BLOSUM62': (11, 1), 'BLOSUM80': (10, 1), 'PAM70': (10, 1), 'PAM30': (9, 1) }
[docs]def megablast(query, db=None, subject=None, query_loc=None, subject_loc=None, evalue=10, parse_deflines=False, num_threads=1, word_size=28, gapopen=5, gapextend=2, reward=1, penalty=-2, strand='both', no_dust=False, no_soft_masking=False, lcase_masking=False, perc_identity=0, no_greedy=False): """ ``megablast`` similarity search. This is designed for strongly similar sequences using a nucleotide query on a nucleotide database. .. include:: blast_options_common.rst .. include:: blast_options_blastn.rst :return: A :class:`.BlastOutput` instance. """ return _blastn(query, db, subject, 'megablast', query_loc, subject_loc, evalue, parse_deflines, num_threads, word_size, gapopen, gapextend, reward, penalty, strand, no_dust, no_soft_masking, lcase_masking, perc_identity, None, None, no_greedy)
[docs]def dc_megablast(query, db=None, subject=None, query_loc=None, subject_loc=None, evalue=10, parse_deflines=False, num_threads=1, word_size=11, gapopen=None, gapextend=None, reward=2, penalty=-3, strand='both', no_dust=False, no_soft_masking=False, lcase_masking=False, perc_identity=0, template_type='coding', template_length=18): """ Dicontinuous ``megablast`` similarity search. This is designed for similar sequences (less similar than :func:`.megablast`) using a nucleotide query on a nucleotide database. .. include:: blast_options_common.rst .. include:: blast_options_blastn.rst :param template_type: template type for for dc-megablast. Possible values are ``"coding"`` (default), ``"optimal"``, and ``"coding_and_optimal"``. :param template_length: template length for dc-megablast. Possible values are 16, 18 (the default), and 21. """ return _blastn(query, db, subject, 'dc-megablast', query_loc, subject_loc, evalue,parse_deflines, num_threads, word_size, gapopen, gapextend, reward, penalty, strand, no_dust, no_soft_masking, lcase_masking, perc_identity, template_type, template_length, False)
[docs]def blastn(query, db=None, subject=None, query_loc=None, subject_loc=None, evalue=10, parse_deflines=False, num_threads=1, word_size=11, gapopen=None, gapextend=None, reward=2, penalty=-3, strand='both', no_dust=False, no_soft_masking=False, lcase_masking=False, perc_identity=0): """ ``blastn`` similarity search. This is designed for distant sequences using a nucleotide query on a nucleotide database. .. include:: blast_options_common.rst .. include:: blast_options_blastn.rst :return: A :class:`BlastOutput` instance. """ return _blastn(query, db, subject, 'blastn', query_loc, subject_loc, evalue, parse_deflines, num_threads, word_size, gapopen, gapextend, reward, penalty, strand, no_dust, no_soft_masking, lcase_masking, perc_identity, None, None, False)
[docs]def blastn_short(query, db=None, subject=None, query_loc=None, subject_loc=None, evalue=1000, parse_deflines=False, num_threads=1, word_size=7, gapopen=None, gapextend=None, reward=1, penalty=-3, strand='both', no_dust=True, no_soft_masking=True, lcase_masking=False, perc_identity=0): """ ``blastn`` for short sequences. This is optimised for query sequences up to 50 bp long. It automatically sets ``evalue=1000``, ``word_size=7``, ``no_dust=True``, ``no_soft_masking=True``, ``reward=1``, ``penalty=-3``, ``gapopen=5`` and ``gapextend=2``. .. include:: blast_options_common.rst .. include:: blast_options_blastn.rst :return: A :class:`BlastOutput` instance. """ return _blastn(query, db, subject, 'blastn-short', query_loc, subject_loc, evalue, parse_deflines, num_threads, word_size, gapopen, gapextend, reward, penalty, strand, no_dust, no_soft_masking, lcase_masking, perc_identity, None, None, False)
@_utils._protect_run def _blastn(query, db, subject, task, query_loc, subject_loc, evalue, parse_deflines, num_threads, word_size, gapopen, gapextend, reward, penalty, strand, no_dust, no_soft_masking, lcase_masking, perc_identity, template_type, template_length, no_greedy): cmd, stdin = _get_common_blast_options('blastn', query, db, subject, query_loc, subject_loc, evalue, parse_deflines, num_threads) # process task cmd.extend(['-task', task]) # word size if not isinstance(word_size, int): raise TypeError('`word_size` must be an integer') if task == 'megablast' and word_size < 4: raise ValueError('invalid value for `word_size`: must be >= 4 for megablast') if task == 'dc-megablast' and word_size not in [11, 12]: raise ValueError('invalid value for `word_size`: must be 11 or 12 for dc-megablast') if task == 'blastn' and word_size < 4: raise ValueError('invalid value for `word_size`: must be >= 4 for blastn') if task == 'blastn-short' and word_size < 4: raise ValueError('invalid value for `word_size`: must be >= 4 for blastn-short') cmd.extend(['-word_size', str(word_size)]) # reward/penalty if not isinstance(reward, int): raise TypeError('`reward` must be an integer') if reward < 0: raise ValueError('`reward` must be >= 0') if not isinstance(penalty, int): raise TypeError('`penalty` must be an integer') if penalty > 0: raise ValueError('`penalty` must be <= 0') if (reward, penalty) not in _blastn_costs: raise ValueError('invalid set of values for reward/penalty: ({0},{1})'.format(reward, penalty)) # gap costs if gapopen is None: if gapextend is not None: raise ValueError('if `gapopen` is ``None``, `gapextend` should be also') gapopen, gapextend = _blastn_costs[(reward, penalty)][0] elif gapextend is None: raise ValueError('if `gapextend` is ``None``, `gapopen` should be also') if not isinstance(gapopen, int): raise TypeError('`gapopen` must be an integer') if gapopen < 0: raise ValueError('`gapopen` must be >= 0') if not isinstance(gapextend, int): raise TypeError('`gapextend` must be an integer') if gapextend < 0: raise ValueError('`gapextend` must be >= 0') if (gapopen, gapextend) not in _blastn_costs[(reward, penalty)][1]: raise ValueError('invalid set of values for gapopen/gapextend: ({0},{1})'.format(gapopen, gapextend)) cmd.extend(['-gapopen', str(gapopen), '-gapextend', str(gapextend), '-reward', str(reward), '-penalty', str(penalty)]) # strand if not isinstance(strand, str): raise TypeError('`strand` must be a string') if strand not in ['both', 'minus', 'plus']: raise ValueError('`strand` must be either "both", "minus", or "plus"') cmd.extend(['-strand', strand]) # filter/mask flags if no_dust: cmd.extend(['-dust', 'no']) if no_soft_masking: cmd.extend(['-soft_masking', 'false']) if lcase_masking: cmd.append('-lcase_masking') # perc_identity if not isinstance(perc_identity, int): raise TypeError('`perc_identity` must be an integer') if perc_identity < 0 or perc_identity > 100: raise ValueError('`perc_identity` out of range') cmd.extend(['-perc_identity', str(perc_identity)]) # template parameters if task == 'dc-megablast': if not isinstance(template_type, str): raise TypeError('`template_type` must be a string') if template_type not in ['coding', 'optimal', 'coding_and_optimal']: raise ValueError('invalid value for `template_type`') if not isinstance(template_length, int): raise TypeError('`template_length` must be an integer') if template_length not in [16, 18, 21]: raise ValueError('invalid value for `template_length`') cmd.extend(['-template_type', template_type]) cmd.extend(['-template_length', str(template_length)]) # aligment flags if no_greedy: cmd.append('-no_greedy') # run the program p = subprocess.Popen(cmd, stdin=None if stdin is None else subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) stdout, stderr = p.communicate(stdin) if p.returncode != 0: raise RuntimeError('error while running blastn (message below)\n' + stderr) return _parse_blast_output(stdout)
[docs]def blastp(query, db=None, subject=None, query_loc=None, subject_loc=None, evalue=10, parse_deflines=False, num_threads=1, word_size=None, gapopen=None, gapextend=None, matrix='BLOSUM62', threshold=11, comp_based_stats=2, seg=0, soft_masking=False, lcase_masking=False, window_size=40, use_sw_tback=False): """ ``bastp`` similary search. This is designed for using a protein query on a protein database. .. include:: blast_options_common.rst .. include:: blast_options_prot.rst :param parse_deflines: parse query and subject bar delimited sequence identifiers. :param comp_based_stats: composition-based statistics, as an integer code: 0 (no composition-based statistics), 1 (composition-based statistics as in NAR 29:2994-3005, 2001), 2 (composition-based score adjustment as in Bioinformatics 21:902-911, 2005, conditioned on sequence properties), or 3 (composition-based score adjustment as in Bioinformatics 21:902-911, 2005, unconditionally). :param soft_masking: apply filtering locations as soft masks. :param lcase_masking: use lower case filtering in query and subject sequences. :param use_sw_tback: compute locally optimal Smith-Waterman alignments. :return: A :class:`BlastOutput` instance. """ return _blastp(query, db, subject, 'blastp', query_loc, subject_loc, evalue, parse_deflines, num_threads, word_size, gapopen, gapextend, matrix, threshold, comp_based_stats, seg, soft_masking, lcase_masking, window_size, use_sw_tback)
[docs]def blastp_short(query, db=None, subject=None, query_loc=None, subject_loc=None, evalue=10, parse_deflines=False, num_threads=1, word_size=None, gapopen=None, gapextend=None, matrix='PAM30', threshold=16, comp_based_stats=0, seg=0, lcase_masking=False, window_size=15, use_sw_tback=False): """ ``blastp`` similarity search for short sequences. .. include:: blast_options_common.rst .. include:: blast_options_prot.rst :param parse_deflines: parse query and subject bar delimited sequence identifiers. :param comp_based_stats: composition-based statistics, as an integer code: 0 (no composition-based statistics), 1 (composition-based statistics as in NAR 29:2994-3005, 2001), 2 (composition-based score adjustment as in Bioinformatics 21:902-911, 2005, conditioned on sequence properties), or 3 (composition-based score adjustment as in Bioinformatics 21:902-911, 2005, unconditionally). :param lcase_masking: use lower case filtering in query and subject sequences. :param use_sw_tback: compute locally optimal Smith-Waterman alignments. :return: A :class:`BlastOutput` instance. """ return _blastp(query, db, subject, 'blastp-short', query_loc, subject_loc, evalue, parse_deflines, num_threads, word_size, gapopen, gapextend, matrix, threshold, comp_based_stats, seg, False, lcase_masking, window_size, use_sw_tback)
[docs]def blastp_fast(query, db=None, subject=None, query_loc=None, subject_loc=None, evalue=10, parse_deflines=False, num_threads=1, word_size=None, threshold=21, comp_based_stats=2, seg=0, lcase_masking=False, window_size=40, use_sw_tback=False): """ Quick ``blastp`` similarity search. .. include:: blast_options_common.rst :param threshold: minimum word score such that the word is added to the BLAST lookup table (>0). :param seg: filter query sequence with SEG as an integer (0 to disable, 1 to enable, or alternatively a tuple with the three parameters *window*, *locut*, and *hicut*). :param window_size: multiple hits window size (use 0 to specify 1-hit algorithm). :param parse_deflines: parse query and subject bar delimited sequence identifiers. :param comp_based_stats: composition-based statistics, as an integer code: 0 (no composition-based statistics), 1 (composition-based statistics as in NAR 29:2994-3005, 2001), 2 (composition-based score adjustment as in Bioinformatics 21:902-911, 2005, conditioned on sequence properties), or 3 (composition-based score adjustment as in Bioinformatics 21:902-911, 2005, unconditionally). :param lcase_masking: use lower case filtering in query and subject sequences. :param use_sw_tback: compute locally optimal Smith-Waterman alignments. :return: A :class:`BlastOutput` instance. """ return _blastp(query, db, subject, 'blastp-fast', query_loc, subject_loc, evalue, parse_deflines, num_threads, word_size, None, None, None, threshold, comp_based_stats, seg, False, lcase_masking, window_size, use_sw_tback)
@_utils._protect_run def _blastp(query, db, subject, task, query_loc, subject_loc, evalue, parse_deflines, num_threads, word_size, gapopen, gapextend, matrix, threshold, comp_based_stats, seg, soft_masking, lcase_masking, window_size, use_sw_tback): cmd, stdin = _get_common_blast_options('blastp', query, db, subject, query_loc, subject_loc, evalue, parse_deflines, num_threads) _get_common_protein_options(cmd, task, word_size, matrix, threshold, gapopen, gapextend, seg, window_size, comp_based_stats) # process task cmd.extend(['-task', task]) # filter/mask flags if task == 'blastp': if soft_masking: cmd.extend(['-soft_masking', 'true']) else: cmd.extend(['-soft_masking', 'false']) if lcase_masking: cmd.append('-lcase_masking') # use_sw_tback if use_sw_tback: cmd.append('-use_sw_tback') # run the program p = subprocess.Popen(cmd, stdin=None if stdin is None else subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) stdout, stderr = p.communicate(stdin) if p.returncode != 0: raise RuntimeError('error while running blastp (message below)\n' + stderr) return _parse_blast_output(stdout)
[docs]def blastx(query, db=None, subject=None, query_loc=None, subject_loc=None, evalue=10, num_threads=1, word_size=None, gapopen=None, gapextend=None, matrix='BLOSUM62', threshold=12, seg=(12, 2.2, 2.5), soft_masking=False, lcase_masking=False, window_size=40, strand='both', query_genetic_code=1, max_intron_length=0, comp_based_stats=2): """ ``blastx`` similarity search. Designed for using a translated nucleotide query on a protein database. .. include:: blast_options_common.rst .. include:: blast_options_prot.rst :param soft_masking: apply filtering locations as soft masks. :param lcase_masking: use lower case filtering in query and subject sequences. :param query_genetic_code: genetic code to translate query. Allowed values are: 1-6, 9-16, 21-25. :param max_intron_length: length of the largest intron allowed in a translated nucleotide sequence when linking multiple distinct alignments (a negative value disables linking). :param strand: query strand(s) to search against database/subject. Choice of both, minus, or plus. :param comp_based_stats: composition-based statistics, as an integer code: 0 (no composition-based statistics), 1 (composition-based statistics as in NAR 29:2994-3005, 2001), 2 (composition-based score adjustment as in Bioinformatics 21:902-911, 2005, conditioned on sequence properties), or 3 (composition-based score adjustment as in Bioinformatics 21:902-911, 2005, unconditionally). :return: A :class:`BlastOutput` instance. """ return _blastx('blastx', query, db, subject, query_loc, subject_loc, evalue, False, num_threads, word_size, gapopen, gapextend, matrix, threshold, seg, soft_masking, lcase_masking, window_size, strand, query_genetic_code, max_intron_length, comp_based_stats)
[docs]def blastx_fast(query, db=None, subject=None, query_loc=None, subject_loc=None, evalue=10, num_threads=1, word_size=None, gapopen=None, gapextend=None, matrix='BLOSUM62', threshold=21, seg=(12, 2.2, 2.5), soft_masking=False, lcase_masking=False, window_size=40, strand='both', query_genetic_code=1, max_intron_length=0, comp_based_stats=2): """ Quick ``blastx`` similarity search. Designed for using a translated nucleotide query on a protein database and optimised for faster execution. .. include:: blast_options_common.rst .. include:: blast_options_prot.rst :param soft_masking: apply filtering locations as soft masks. :param lcase_masking: use lower case filtering in query and subject sequences. :param query_genetic_code: genetic code to translate query. Allowed values are: 1-6, 9-16, 21-25. :param max_intron_length: length of the largest intron allowed in a translated nucleotide sequence when linking multiple distinct alignments (a negative value disables linking). :param strand: query strand(s) to search against database/subject. Choice of both, minus, or plus. :param comp_based_stats: composition-based statistics, as an integer code: 0 (no composition-based statistics), 1 (composition-based statistics as in NAR 29:2994-3005, 2001), 2 (composition-based score adjustment as in Bioinformatics 21:902-911, 2005, conditioned on sequence properties), or 3 (composition-based score adjustment as in Bioinformatics 21:902-911, 2005, unconditionally). :return: A :class:`BlastOutput` instance. """ return _blastx('blastx-fast', query, db, subject, query_loc, subject_loc, evalue, False, num_threads, word_size, gapopen, gapextend, matrix, threshold, seg, soft_masking, lcase_masking, window_size, strand, query_genetic_code, max_intron_length, comp_based_stats)
@_utils._protect_run def _blastx(task, query, db, subject, query_loc, subject_loc, evalue, parse_deflines, num_threads, word_size, gapopen, gapextend, matrix, threshold, seg, soft_masking, lcase_masking, window_size, strand, query_genetic_code, max_intron_length, comp_based_stats): cmd, stdin = _get_common_blast_options('blastx', query, db, subject, query_loc, subject_loc, evalue, parse_deflines, num_threads) cmd.extend(['-task', task]) _get_common_protein_options(cmd, task, word_size, matrix, threshold, gapopen, gapextend, seg, window_size, comp_based_stats) if soft_masking: cmd.extend(['-soft_masking', 'true']) else: cmd.extend(['-soft_masking', 'false']) if lcase_masking: cmd.append('-lcase_masking') if not isinstance(strand, str): raise TypeError('`strand` must be a string') if strand not in ['both', 'minus', 'plus']: raise ValueError('`strand` must be either "both", "minus", or "plus"') cmd.extend(['-strand', strand]) if query_genetic_code not in [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25]: raise ValueError('invalid genetic code: {0}'.format(query_genetic_code)) cmd.extend(['-query_gencode', str(query_genetic_code)]) if not isinstance(max_intron_length, int): raise TypeError('`max_intron_length` must be an integer') if max_intron_length < 0: raise ValueError('`max_intron_length` must be >= 0') cmd.extend(['-max_intron_length', str(max_intron_length)]) # run the program p = subprocess.Popen(cmd, stdin=None if stdin is None else subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) stdout, stderr = p.communicate(stdin) if p.returncode != 0: raise RuntimeError('error while running blastx (message below)\n' + stderr) return _parse_blast_output(stdout)
[docs]def tblastn(query, db=None, subject=None, query_loc=None, subject_loc=None, evalue=10, num_threads=1, word_size=None, gapopen=None, gapextend=None, matrix='BLOSUM62', threshold=13, seg=(12, 2.2, 2.5), soft_masking=False, window_size=40, db_genetic_code=1, max_intron_length=0, comp_based_stats=2): """ ``tblastn`` similary search. Designed for using a protein query on a translated nucleotide database. .. include:: blast_options_common.rst .. include:: blast_options_prot.rst :param soft_masking: apply filtering locations as soft masks. :param db_genetic_code: genetic code to translate subject sequences. Allowed values are: 1-6, 9-16, 21-25. :param max_intron_length: length of the largest intron allowed in a translated nucleotide sequence when linking multiple distinct alignments (a negative value disables linking). :param comp_based_stats: composition-based statistics, as an integer code: 0 (no composition-based statistics), 1 (composition-based statistics as in NAR 29:2994-3005, 2001), 2 (composition-based score adjustment as in Bioinformatics 21:902-911, 2005, conditioned on sequence properties), or 3 (composition-based score adjustment as in Bioinformatics 21:902-911, 2005, unconditionally). :return: A :class:`BlastOutput` instance. """ return _tblastn('tblastn', query, db, subject, query_loc, subject_loc, evalue, num_threads, word_size, gapopen, gapextend, matrix, threshold, seg, soft_masking, window_size, db_genetic_code, max_intron_length, comp_based_stats)
[docs]def tblastn_fast(query, db=None, subject=None, query_loc=None, subject_loc=None, evalue=10, num_threads=1, word_size=None, gapopen=None, gapextend=None, matrix='BLOSUM62', threshold=21, seg=(12, 2.2, 2.5), soft_masking=False, window_size=40, db_genetic_code=1, max_intron_length=0, comp_based_stats=2): """ Quich ``tblastn`` similary search. Designed for using a protein query on a translated nucleotide database and optimised for fast execution. .. include:: blast_options_common.rst .. include:: blast_options_prot.rst :param soft_masking: apply filtering locations as soft masks. :param db_genetic_code: genetic code to translate subject sequences. Allowed values are: 1-6, 9-16, 21-25. :param max_intron_length: length of the largest intron allowed in a translated nucleotide sequence when linking multiple distinct alignments (a negative value disables linking). :param comp_based_stats: composition-based statistics, as an integer code: 0 (no composition-based statistics), 1 (composition-based statistics as in NAR 29:2994-3005, 2001), 2 (composition-based score adjustment as in Bioinformatics 21:902-911, 2005, conditioned on sequence properties), or 3 (composition-based score adjustment as in Bioinformatics 21:902-911, 2005, unconditionally). :return: A :class:`BlastOutput` instance. """ return _tblastn('tblastn-fast', query, db, subject, query_loc, subject_loc, evalue, num_threads, word_size, gapopen, gapextend, matrix, threshold, seg, soft_masking, window_size, db_genetic_code, max_intron_length, comp_based_stats)
@_utils._protect_run def _tblastn(task, query, db, subject, query_loc, subject_loc, evalue, num_threads, word_size, gapopen, gapextend, matrix, threshold, seg, soft_masking, window_size, db_genetic_code, max_intron_length, comp_based_stats): # process common options cmd, stdin = _get_common_blast_options('tblastn', query, db, subject, query_loc, subject_loc, evalue, False, num_threads) _get_common_protein_options(cmd, task, word_size, matrix, threshold, gapopen, gapextend, seg, window_size, comp_based_stats) # process task cmd.extend(['-task', task]) # filter/mask flags if soft_masking: cmd.extend(['-soft_masking', 'true']) else: cmd.extend(['-soft_masking', 'false']) # db_genetic_code if db_genetic_code not in [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25]: raise ValueError('invalid genetic code: {0}'.format(db_genetic_code)) cmd.extend(['-db_gencode', str(db_genetic_code)]) # max_intron_length if not isinstance(max_intron_length, int): raise TypeError('`max_intron_length` must be an integer') if max_intron_length < 0: raise ValueError('`max_intron_length` must be >= 0') cmd.extend(['-max_intron_length', str(max_intron_length)]) # run the program p = subprocess.Popen(cmd, stdin=None if stdin is None else subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) stdout, stderr = p.communicate(stdin) if p.returncode != 0: raise RuntimeError('error while running tblastn (message below)\n' + stderr) return _parse_blast_output(stdout) @_utils._protect_run def tblastx(query, db=None, subject=None, query_loc=None, subject_loc=None, evalue=10, num_threads=1, word_size=None, matrix='BLOSUM62', seg=(12, 2.2, 2.5), soft_masking=False, lcase_masking=False, window_size=40, db_genetic_code=1, query_genetic_code=1, strand='both'): """ ``tblastx`` similary search. Designed for using a translated nucleotide query on a translated nucleotide database. .. include:: blast_options_common.rst :param matrix: scoring matrix name. Available values are: PAM-30, PAM-70, BLOSUM-80, and BLOSUM-62. :param seg: filter query sequence with SEG as an integer (0 to disable, 1 to enable, or alternatively a tuple with the three parameters *window*, *locut*, and *hicut*). :param window_size: multiple hits window size (use 0 to specify 1-hit algorithm). :param soft_masking: apply filtering locations as soft masks. :param lcase_masking: use lower case filtering in query and subject sequences. :param query_genetic_code: genetic code to translate query. Allowed values are: 1-6, 9-16, 21-25. :param db_genetic_code: genetic code to translate subject sequences. Allowed values are: 1-6, 9-16, 21-25. :param strand: query strand(s) to search against database/subject. Choice of both, minus, or plus. :return: A :class:`BlastOutput` instance. """ # process common options cmd, stdin = _get_common_blast_options('tblastx', query, db, subject, query_loc, subject_loc, evalue, False, num_threads) _get_common_protein_options(cmd, 'tblastx', word_size, matrix, None, None, None, seg, window_size, None) # filter/mask flags if soft_masking: cmd.extend(['-soft_masking', 'true']) else: cmd.extend(['-soft_masking', 'false']) if lcase_masking: cmd.append('-lcase_masking') # query_genetic_code if query_genetic_code not in [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25]: raise ValueError('invalid genetic code: {0}'.format(query_genetic_code)) cmd.extend(['-query_gencode', str(query_genetic_code)]) # db_genetic_code if db_genetic_code not in [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25]: raise ValueError('invalid genetic code: {0}'.format(db_genetic_code)) cmd.extend(['-db_gencode', str(db_genetic_code)]) # strand if not isinstance(strand, str): raise TypeError('`strand` must be a string') if strand not in ['both', 'minus', 'plus']: raise ValueError('`strand` must be either "both", "minus", or "plus"') cmd.extend(['-strand', strand]) # run the program p = subprocess.Popen(cmd, stdin=None if stdin is None else subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) stdout, stderr = p.communicate(stdin) if p.returncode != 0: raise RuntimeError('error while running tblastx (message below)\n' + stderr) return _parse_blast_output(stdout) def _get_common_blast_options(command, query, db, subject, query_loc, subject_loc, evalue, parse_deflines, num_threads): # get command name path = _blast_apps[command].get_path() if path is None: raise RuntimeError('{0} program not available -- please configure path'.format(command)) cmd = [path, '-query', '-', '-outfmt', '5'] # process query if isinstance(query, str): stdin = '>lcl|Query\n' + query + '\n' queryL = len(query) elif isinstance(query, _interface.SequenceView): if command in ['blastn', 'blastx', 'tblastx'] and query._parent._alphabet != alphabets.DNA: raise ValueError('query alphabet must be DNA') if command in ['blastp', 'tblastn'] and query._parent._alphabet != alphabets.protein: raise ValueError('query alphabet must be protein') stdin = '>lcl|Query\n' + query.string() + '\n' queryL = len(query) elif isinstance(query, _interface.SampleView): if command in ['blastn', 'blastx', 'tblastx'] and query._parent._alphabet != alphabets.DNA: raise ValueError('query alphabet must be DNA') if command in ['blastp', 'tblastn'] and query._parent._alphabet != alphabets.protein: raise ValueError('query alphabet must be protein') stdin = '>lcl|{0}\n{1}\n'.format(query.name, query.sequence.string()) queryL = len(query.sequence) elif isinstance(query, (_interface.Align, _interface.Container)): if command in ['blastn', 'blastx', 'tblastx'] and query._alphabet != alphabets.DNA: raise ValueError('query alphabet must be DNA') if command in ['blastp', 'tblastn'] and query._alphabet != alphabets.protein: raise ValueError('query alphabet must be protein') stdin = query.fasta() else: raise TypeError('`query` must be an Align, a Container, a SampleView, a SequenceView, or a string') # process db/subject if db is not None and subject is not None: raise ValueError('only one of `db` and `subject` must be specified') if db is None and subject is None: raise ValueError('either `db` or `subject` must be specified') elif db is not None: if not isinstance(db, str): raise ValueError('`db` must be a string') if not os.path.isabs(db): db = os.path.normpath(os.path.join(_utils._protect_path_mapping[os.getcwd()], db)) cmd.extend(['-db', db]) else: f = open('in.fas', 'w') if isinstance(subject, str): f.write('>lcl|Subject\n{0}\n'.format(subject)) subjectL = len(subject) elif isinstance(subject, _interface.SequenceView): if command in ['blastn', 'tblastn', 'tblastx'] and subject._parent._alphabet != alphabets.DNA: raise ValueError('subject alphabet must be DNA') if command in ['blastp', 'blastx'] and subject._parent._alphabet != alphabets.protein: raise ValueError('subject alphabet must be protein') f.write('>lcl|Subject\n{0}\n'.format(subject.string())) subjectL = len(subject) elif isinstance(subject, _interface.SampleView): if command in ['blastn', 'tblastn', 'tblastx'] and subject._parent._alphabet != alphabets.DNA: raise ValueError('subject alphabet must be DNA') if command in ['blastp', 'blastx'] and subject._parent._alphabet != alphabets.protein: raise ValueError('subject alphabet must be protein') f.write('>lcl|{0}\n{1}\n'.format(subject.name, subject.sequence.string())) subjectL = len(subject.sequence) else: raise TypeError('`subject` must be a SampleView, a SequenceView, or a string') cmd.extend(['-subject', 'in.fas']) # process query_loc if query_loc is not None: if isinstance(query, _interface.Container): raise ValueError('`query_loc` is not supported if `query` is a Container') if not isinstance(query_loc, (list, tuple)): raise TypeError('`query_loc` must be a list or a tuple') if len(query_loc) != 2: raise ValueError('`query_loc` must have two items') if not isinstance(query_loc[0], int) or not isinstance(query_loc[1], int): raise TypeError('`query_loc` must contain two integers') if query_loc[0] >= queryL or query_loc[0] < 0 or query_loc[1] < query_loc[0] + 1: raise ValueError('invalid values for `query_loc`') cmd.extend(['-query_loc', '{0}-{1}'.format(query_loc[0]+1, query_loc[1])]) # process subject_loc if subject_loc is not None: if subject is None: raise ValueError('`subject_loc` cannot be specified if `subject` is not specified') if not isinstance(subject_loc, (list, tuple)): raise TypeError('`subject_loc` must be a list or a tuple') if len(subject_loc) != 2: raise ValueError('`subject_loc` must have two items') if not isinstance(subject_loc[0], int) or not isinstance(subject_loc[1], int): raise TypeError('`subject_loc` must contain two integers') if subject_loc[0] >= subjectL or subject_loc[0] < 0 or subject_loc[1] < subject_loc[0] + 1: raise ValueError('invalid values for `subject_loc`') cmd.extend(['-subject_loc', '{0}-{1}'.format(subject_loc[0]+1, subject_loc[1])]) # process evalue if evalue is not None: if not isinstance(evalue, (int, float)): raise TypeError('`evalue` must be a real value') if evalue <= 0: raise ValueError('`evalue` must be > 0') cmd.extend(['-evalue', str(evalue)]) # parse_deflines if parse_deflines: cmd.append('-parse_deflines') # num_threads if not isinstance(num_threads, int): raise TypeError('`num_threads` must be an integer') if num_threads < 1: raise ValueError('`num_threads` must be >= 1') if num_threads > 1 and subject is not None: raise ValueError('`num_threads cannot be > 1 if `subject` is used') cmd.extend(['-num_threads', str(num_threads)]) return cmd, stdin def _get_common_protein_options(cmd, task, word_size, matrix, threshold, gapopen, gapextend, seg, window_size, comp_based_stats): # word size if word_size is not None: if not isinstance(word_size, int): raise TypeError('`word_size` must be an integer') if word_size < 2 or word_size > 7: raise ValueError('invalid value for `word_size`: must be in range [2,7]') cmd.extend(['-word_size', str(word_size)]) # matrix if matrix is not None: if matrix not in ['PAM30', 'PAM70', 'BLOSUM80', 'BLOSUM62']: raise ValueError('invalid value for `matrix`: {0}'.format(matrix)) cmd.extend(['-matrix', matrix]) # gap costs if task not in ['blastp-fast', 'tblastx']: if gapopen is None: if gapextend is not None: raise ValueError('if `gapopen is ``None``, `gapextend` must be also') gapopen, gapextend = _blastp_costs_defaults[matrix] else: if gapextend is None: raise ValueError('if `gapextend is ``None``, `gapopen` must be also') if not isinstance(gapopen, int): raise TypeError('`gapopen` must be an integer') if not isinstance(gapextend, int): raise TypeError('`gapextend` must be an integer') if (gapopen, gapextend) not in _blastp_costs[matrix]: raise ValueError('invalid gapopen/gapextend combination for {0}'.format(matrix)) cmd.extend(['-gapopen', str(gapopen), '-gapextend', str(gapextend)]) # threshold if threshold is not None: if not isinstance(threshold, int): raise TypeError('`threshold` must be an integer') if threshold <= 0: raise ValueError('invalid value for `treshold`: must be > 0') cmd.extend(['-threshold', str(threshold)]) # seg if seg is not None: if seg == 0: seg = 'no' elif seg == 1: seg = 'yes' elif not isinstance(seg, (tuple, list)): raise ValueError('invalid value for `seg`: must be 0, 1, or a tuple/list of 3 items') else: try: window, locut, hicut = seg window = int(window) except ValueError: raise ValueError('invalid value for `seg`') if window < 1: raise ValueError('`window` (seg parameter) must be > 0') locut = float(locut) if locut <= 0: raise ValueError('`locut` (seg parameter) must be > 0') hicut = float(hicut) if hicut <= locut: raise ValueError('`hicut` (seg parameter) must be > `locut`') seg = '{0} {1} {2}'.format(window, locut, hicut) cmd.extend(['-seg', seg]) # window_size if window_size is not None: if not isinstance(window_size, int): raise TypeError('`window_size` must be an integer') if window_size < 0: raise ValueError('invalid value for `window_size`: {0}'.format(window_size)) cmd.extend(['-window_size', str(window_size)]) # comp_based_stats if comp_based_stats is not None: if comp_based_stats not in [0, 1, 2, 3]: raise ValueError('invalid value for `comp_based_stats`') cmd.extend(['-comp_based_stats', str(comp_based_stats)]) def _xml_find(node, tag): res = node.find(tag) if res is None: raise ValueError('invalid format of BLAST XML output: cannot find `{0}` under `{1}`'.format(tag, node.tag)) return res def _xml_find_type(node, tag, type_): value = _xml_find(node, tag).text try: return type_(value) except ValueError: raise ValueError('invalid format of BLAST XML output: invalid value for `{0}`: {1}'.format(tag, value)) def _parse_blast_output(string): root = xml.etree.ElementTree.fromstring(string) if root.tag != 'BlastOutput': raise ValueError('invalid format of BLAST XML output: no `BlastOutput` at root') out = BlastOutput() out._program = _xml_find(root, 'BlastOutput_program').text out._version = _xml_find(root, 'BlastOutput_version').text out._reference = _xml_find(root, 'BlastOutput_reference').text out._db = _xml_find(root, 'BlastOutput_db').text out._query_ID = _xml_find(root, 'BlastOutput_query-ID').text out._query_def = _xml_find(root, 'BlastOutput_query-def').text out._query_len = _xml_find_type(root, 'BlastOutput_query-len', int) node = _xml_find(root, 'BlastOutput_param') nodes = node.findall('Parameters') if len(nodes) != 1: raise ValueError('invalid format of BLAST XML output: expect one set of BLAST parameters') node = nodes[0] out._params = {} for child in node: if child.tag[:11] != 'Parameters_': raise ValueError('invalid parameter in BLAST output: {0}'.format(child.tag)) tag = child.tag[11:] if tag in out._params: raise ValueError('parameter repeated in BLAST output: {0}'.format(child.tag)) try: out._params[tag] = int(child.text) except ValueError: try: out._params[tag] = float(child.text) except ValueError: out._params[tag] = child.text iterations = _xml_find(root, 'BlastOutput_iterations') for query_node in iterations: if query_node.tag != 'Iteration': raise ValueError('invalid format of BLAST XML output: unexpected node under `BlastOutput_iterations`: {0}'.format(query_node.tag)) query_hits = BlastQueryHits() i = _xml_find_type(query_node, 'Iteration_iter-num', int) if i != len(out._query_hits) + 1: raise ValueError('invalid format of BLAST XML output: unexpected value for `Iteration_iter-num`: {0}'.format(i)) query_hits._num = i - 1 query_hits._query_ID = _xml_find(query_node, 'Iteration_query-ID').text query_hits._query_def = _xml_find(query_node, 'Iteration_query-def').text query_hits._query_len = _xml_find_type(query_node, 'Iteration_query-len', int) node = _xml_find(query_node, 'Iteration_stat') nodes = node.findall('Statistics') if len(nodes) != 1: raise ValueError('invalid format of BLAST XML output: expect one set of iteration statistics') node = nodes[0] query_hits._db_num = _xml_find_type(node, 'Statistics_db-num', int) query_hits._db_len = _xml_find_type(node, 'Statistics_db-len', int) query_hits._hsp_len = _xml_find_type(node, 'Statistics_hsp-len', int) query_hits._eff_space = _xml_find_type(node, 'Statistics_eff-space', int) query_hits._kappa = _xml_find_type(node, 'Statistics_kappa', float) query_hits._lambda = _xml_find_type(node, 'Statistics_lambda', float) query_hits._entropy = _xml_find_type(node, 'Statistics_entropy', float) out._query_hits.append(query_hits) hits = _xml_find(query_node, 'Iteration_hits') for hit_node in hits: if hit_node.tag != 'Hit': raise ValueError('invalid format of BLAST XML output: unexpected node under `Iteration_hits`: {0}'.format(hit_node.tag)) hit = BlastHit() i = _xml_find_type(hit_node, 'Hit_num', int) if i != len(query_hits._hits) + 1: raise ValueError('invalid format of BLAST XML output: unexpected value for `Hit_num`: {0}'.format(i)) hit._num = i - 1 hit._id = _xml_find(hit_node, 'Hit_id').text hit._def = _xml_find(hit_node, 'Hit_def').text hit._accession = _xml_find(hit_node, 'Hit_accession').text hit._len = _xml_find_type(hit_node, 'Hit_len', int) query_hits._hits.append(hit) Hsps = _xml_find(hit_node, 'Hit_hsps') for Hsp_node in Hsps: if Hsp_node.tag != 'Hsp': raise ValueError('invalid format of BLAST XML output: unexpected node under `Hit_hsps`: {0}'.format(Hsp_node.tag)) Hsp = BlastHsp() i = _xml_find_type(Hsp_node, 'Hsp_num', int) if i != len(hit._hsp) + 1: raise ValueError('invalid format of BLAST XML output: unexpected value for `Hsp_num`: {0}'.format(i)) Hsp._num = i - 1 Hsp._bit_score = _xml_find_type(Hsp_node, 'Hsp_bit-score', float) Hsp._evalue = _xml_find_type(Hsp_node, 'Hsp_evalue', float) Hsp._identity = _xml_find_type(Hsp_node, 'Hsp_identity', int) Hsp._gaps = _xml_find_type(Hsp_node, 'Hsp_gaps', int) Hsp._positive = _xml_find_type(Hsp_node, 'Hsp_positive', int) Hsp._align_len = _xml_find_type(Hsp_node, 'Hsp_align-len', int) Hsp._qseq = _xml_find(Hsp_node, 'Hsp_qseq').text Hsp._midline = _xml_find(Hsp_node, 'Hsp_midline').text Hsp._hseq = _xml_find(Hsp_node, 'Hsp_hseq').text if len(Hsp._qseq) != Hsp._align_len: raise ValueError('invalid format of BLAST XML output: incompatibility between `qseq` and `align-len`') if len(Hsp._hseq) != Hsp._align_len: raise ValueError('invalid format of BLAST XML output: incompatibility between `hseq` and `align-len`') if len(Hsp._midline) != Hsp._align_len: raise ValueError('invalid format of BLAST XML output: incompatibility between `midline` and `align-len`') qfrom = _xml_find_type(Hsp_node, 'Hsp_query-from', int) qto = _xml_find_type(Hsp_node, 'Hsp_query-to', int) Hsp._query_frame = _xml_find_type(Hsp_node, 'Hsp_query-frame', int) if out._program in ['blastp', 'tblastn']: if Hsp._query_frame != 0: raise ValueError('invalid value for `Hsp_query-frame`: {0}'.format(Hsp._query_frame)) if qto <= qfrom: raise ValueError('invalid values for `Hsp_query-from/to`: {0} and {1} with positive frame'.format(qfrom, qto)) Hsp._query_start = qfrom - 1 Hsp._query_stop = qto else: if Hsp._query_frame not in [-3, -2, -1, 1, 2, 3]: raise ValueError('invalid value for `Hsp_query-frame`: {0}'.format(Hsp._query_frame)) qfrom, qto = sorted([qfrom, qto]) Hsp._query_start = qfrom - 1 Hsp._query_stop = qto hfrom = _xml_find_type(Hsp_node, 'Hsp_hit-from', int) hto = _xml_find_type(Hsp_node, 'Hsp_hit-to', int) Hsp._hit_frame = _xml_find_type(Hsp_node, 'Hsp_hit-frame', int) if out._program in ['blastp', 'blastx']: if Hsp._hit_frame != 0: raise ValueError('invalid value for `Hsp_hit-frame`: {0}'.format(Hsp._hit_frame)) if hto <= hfrom: raise ValueError('invalid values for `Hsp_hit-from/to`: {0} and {1} with positive frame'.format(hfrom, hto)) Hsp._hit_start = hfrom - 1 Hsp._hit_stop = hto else: if Hsp._hit_frame not in [-3, -2, -1, 1, 2, 3]: raise ValueError('invalid value for `Hsp_hit-frame`: {0}'.format(Hsp._hit_frame)) hfrom, hto = sorted([hfrom, hto]) Hsp._hit_start = hfrom - 1 Hsp._hit_stop = hto hit._hsp.append(Hsp) return out
[docs]class BlastOutput(object): """ Full results of a BLAST run. """ def __init__(self): self._program = None self._version = None self._reference = None self._db = None self._query_ID = None self._query_def = None self._query_len = None self._params = None self._query_hits = [] @property def program(self): """ Name of the program used.""" return self._program @property def version(self): """ Version of the program. """ return self._version @property def reference(self): """ Bibliographic reference. """ return self._reference @property def db(self): """ Name of the database used. """ return self._db @property def query_ID(self): """ Identifier of the query. """ return self._query_ID @property def query_def(self): """ Description of the query. """ return self._query_def @property def query_len(self): """ Length of the query. """ return self._query_len @property def params(self): """ Search parameters. ``"expect"``: E-value, ``"reward"``: nucleotide match reward, ``"penalty"``: nucleotide mismatch reward, ``"gapopen"``: cost for opening a gap, ``"gapextend"``: cost for extending a gap. ``"filter"``: filter string. """ return self._params @property def num_queries(self): """ Number of queries used in the BLAST search. ``blast_output.num_queries`` is also available as ``len(blast_output)``. """ return len(self._query_hits) def __len__(self): return len(self._query_hits)
[docs] def iter_queries(self): """ Iterator over queries. Allows to iterate over :class:`.BlastQueryHits` instances for all queries. ``for query_hit in blast_output.iter_queries()`` is also available as ``for query_hit in blast_output``. """ return self.__iter__()
def __iter__(self): for i in self._query_hits: yield i
[docs] def get_query(self, i): """ Hits for a given query. An instance of :class:`.BlastQueryHits` is returned. ``blast_output.get_query(i)`` is also available as ``blast_output[i]``. """ return self._query_hits[i]
def __getitem__(self, i): return self._query_hits[i] @property def num_hits(self): """ Total number of hits for all queries. """ return sum(map(len, self._query_hits))
[docs] def iter_hits(self): """ Iterator over all hits of all queries. Iterates over :class:`.BlastHit` instances """ for query in self._query_hits: for hit in query: yield hit
@property def num_hsp(self): """ Total number of Hsp's for all hits of all entries. """ return sum([hit.num_hsp() for hit in self._query_hits])
[docs] def iter_hsp(self): """ Iterator over all Hsp's of all hits of all queries. """ for query in self._query_hits: for hit in query: for Hsp in hit: yield Hsp
[docs]class BlastQueryHits(object): """ Results for a given query of a BLAST run. """ def __init__(self): self._num = None self._query_ID = None self._query_def = None self._query_len = None self._db_num = None self._db_len = None self._hsp_len = None self._eff_space = None self._kappa = None self._lambda = None self._entropy = None self._hits = [] @property def num(self): """ index of the query in the BLAST run. """ return self._num @property def query_ID(self): """Identifier of the query.""" return self._query_ID @property def query_len(self): """Length of the query.""" return self._query_len @property def query_def(self): """Description of the query.""" return self._query_def @property def db_num(self): """Number of sequence in the database.""" return self._db_num @property def db_len(self): """Number of letters in the database.""" return self._db_len @property def hsp_len(self): """Length adjustment.""" return self._hsp_len @property def eff_space(self): """Effective space of the search.""" return self._eff_space @property def K(self): """Karlin-Altschul kappa parameter.""" return self._kappa @property def L(self): """Karlin-Altschul lambda parameter.""" return self._lambda @property def H(self): """Karlin-Altschul entropy parameter.""" return self._entropy @property def num_hits(self): """ Number of hits for this query. ``query_hits.num_hits()`` is also available as ``len(query_hits)``. """ return len(self._hits) def __len__(self): return len(self._hits)
[docs] def iter_hits(self): """ Iterator to the :class:`.BlastHit` instances of all hits. ``for hit in query_hits.iter_hits()`` is also available as ``for hit in query_hits``. """ for i in self._hits: yield i
def __iter__(self): for i in self._hits: yield i
[docs] def get_hit(self, i): """ Get a given hit, as a :class:`.BlastHit` instance. ``query_hits.get_hit(i)`` is also available as ``query_hits[i]``. """ return self._hits[i];
def __getitem__(self, i): return self._hits[i] @property def num_hsp(self): """ Total number of Hsp's for all hits. """ return sum(map(len, self._hits))
[docs] def iter_hsp(self): """ Iterator over all Hsp's of all hits, as :class:`.BlastHsp` instances """ for hit in self._hits: for hsp in hit: yield hsp
[docs]class BlastHit(object): """ Results for a given hit of a BLAST run. """ def __init__(self): self._num = None self._id = None self._def = None self._accession = None self._len = None self._hsp = [] @property def num(self): """Index of the hit for the corresponding query.""" return self._num @property def id(self): """Identifier of the subject.""" return self._id @property def descr(self): """Description of the subject.""" return self._def @property def accession(self): """Identifier of the subject.""" return self._accession @property def len(self): """Length of subject.""" return self._len @property def num_hsp(self): """ Number of Hsp's in this hit. ``hit.num_Hsp() is also available as ``len(hit)``. """ return len(self._hsp) def __len__(self): return len(self._hsp)
[docs] def iter_hsp(self): """ Iterator to the :class:`.BlastHsp` instances for all Hsp's. ``for hsp in hit.iter_hsp()`` is also available as ``for hsp in hit``. """ for i in self._hsp: yield i
def __iter__(self): for i in self._hsp: yield i
[docs] def get_hsp(self, i): """ Get a given Hsp, as a :class:`.BlastHsp` instance. ``hit.get_hsp(i)`` is also available as ``hit[i]``. """ return self._hsp[i];
def __getitem__(self, i): return self._hsp[i]
[docs]class BlastHsp(object): """ Description of an Hsp of a BLAST run. Start and stop positions are always interpreted as range parameters (use `frame` to determine if the complement should be used):: >>> hit_sequence = seq[query_start:query_to] """ def __init__(self): self._num = None self._bit_score = None self._evalue = None self._query_start = None self._query_stop = None self._query_frame = None self._hit_start = None self._hit_stop = None self._hit_frame = None self._identity = None self._positive = None self._gaps = None self._align_len = None self._qseq = None self._hseq = None self._midline = None @property def num(self): """Index of the Hsp in the corresponding hit.""" return self._num @property def bit_score(self): """Bit score of the Hsp.""" return self._bit_score @property def evalue(self): """Expectation value of the Hsp.""" return self._evalue @property def query_start(self): """Start position on the query.""" return self._query_start @property def query_stop(self): """Stop position on the query.""" return self._query_stop @property def hit_start(self): """Start position on the subject.""" return self._hit_start @property def hit_stop(self): """Stop position on the subject.""" return self._hit_stop @property def query_frame(self): """Frame of the query.""" return self._query_frame @property def hit_frame(self): """Frame of the hit.""" return self._hit_frame @property def identity(self): """Number of identical positions.""" return self._identity @property def positive(self): """Number of positions with positive score.""" return self._positive @property def gaps(self): """Number of gap positions.""" return self._gaps @property def align_len(self): """Length of the alignment.""" return self._align_len @property def qseq(self): """Aligned query sequence.""" return self._qseq @property def hseq(self): """Aligned subject sequence.""" return self._hseq @property def midline(self): """Alignment midline.""" return self._midline