Source code for egglib.io._ms
"""
Copyright 2018-2023 Stephane De Mita, Mathieu Siol
This file is part of EggLib.
EggLib is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
EggLib is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with EggLib. If not, see <http://www.gnu.org/licenses/>.
"""
import sys
from io import StringIO
from .. import _interface
[docs]def to_ms(aln, outfile, positions=None, alphabet=None, as_codes=False,
converter=None, spacer='auto', header='//\n'):
"""
Export data in the ``ms`` program format. The program is described in
Hudson (2002 [*Bioinformatics* **18**: 337-338]).
The format is as follows:
* One header line with two slashes (can be extended/replaced).
* One line with the number of sites
* One line with the positions, only if the number of sites is larger
than zero.
* The matrix of genotypes (one line per sample), only if the number
of sites is larger than zero.
* One empty line.
``ms`` generates binary data (0/1). This function may export any
integer values depending on the data contained in the input object.
:param data: :class:`.Align` instance.
:param outfile: where to write data. This argument can be (i) a file
(or file-like) object open for writing, (ii) the name of a file
(which will be created) as a string, or (iii) `None` (in which
case the resulting string will be returned).
:param positions: list of site positions, with length matching
the alignment length. Positions are required to be in the [0,1]
range but the order is not checked. By default (if the argument
value is ``None``), sites are supposed to be evenly spread over
the [0,1] interval.
:param alphabet: an :class:`.Alphabet` instance to use
instead of the alphabet attached to the input alignment. When
using this argument, alleles are transformed using the alphabet
provided as argument using a direct mapping of the input alignment
alphabet to the provided alphabet.
:param as_codes: use the internal alphabet codes for
exporting. Useful if a non-int alphabet is used. Ignored if
*converter* is used. Not allowed if *alphabet* is used.
:param converter: a user-provided function generating the appropriate
allelic value based on an input alphabet code. The simple inline
example below shows how to convert any missing data to 0 and shift all
valid alleles of one unit: ``converter=lambda x: 0 if x<0 else x+1``.
:param spacer: insert a space between all genotypes
(to separate each locus/site). By default, all genotypes are
concatenated as in the standard ``ms`` format.
In this case, it is required that all allelic
values (or codes if *as_codes* is specified) are >=0 and <10. By
default (``"auto"``), the spacer is automatically inserted if
the alphabet defines alleles or codes outside the [0, 9] range,
or if *converter* is specified.
:param header: string to print instead of the two slashes line.
Must contain the final new line.
:return: A ``ms``-formatted string if *outfile* is ``None``,
otherwise ``None``.
"""
# error checking
if not isinstance(aln, _interface.Align): raise TypeError('an Align instance is expected')
ls = aln.ls
if positions is not None:
if len(positions) != ls: raise ValueError('invalid number of items in the positions list')
if min(positions) < 0.0 or max(positions) > 1.0: raise ValueError('out of range item in the positions list')
if alphabet is None: alph = aln.alphabet
else:
if as_codes:
raise ValueError('options `as_codes` and `alphabet` are incompatible')
alph = alphabet
if alph.type not in ['int', 'range'] and converter is None and as_codes == False: raise ValueError('unsupported alphabet type: {0}'.format(alph.type))
if converter is not None and as_codes:
raise ValueError('options `as_codes` and `converter` are incompatible')
# manage output file
if outfile is None:
out = StringIO()
elif isinstance(outfile, str): out = open(outfile, 'w')
else: out = outfile
out.write(header)
# shows number of sites and site
out.write('segsites: {0}\n'.format(ls))
if positions is None:
if ls == 0: positions = []
elif ls == 1: positions = [0.5]
else: positions = [i / (ls-1) for i in range(ls)]
if ls > 0:
out.write('positions: {0}\n'.format(' '.join(['{0:.5f}'.format(i) for i in positions])))
else:
out.write('\n')
# spacer
if spacer == 'auto':
if converter is not None: spacer = True
else:
if as_codes:
if alph._obj.num_missing() > 0 or alph._obj.num_exploitable() > 10: spacer = True
else: spacer = False
else:
if alph.type == 'range':
if alph._obj.min_value() < 0 or alph._obj.max_value() > 9: spacer = True
else: spacer = False
else:
exp, mis = alph.get_alleles()
alls = exp + mis
if min(alls) < 0 or max(alls) > 9: spacer = True
else: spacer = False
# export data
if converter is not None: f = converter
elif as_codes: f = lambda x: x
else: f = alph.get_value
for i in range(aln.ns):
values = tuple(map(f, [aln._obj.get_sample(i, j) for j in range(ls)]))
if not spacer and (min(values) < 0 or max(values) > 9): raise ValueError('not printable allele for ms format (you should insert a spacer)')
if spacer: out.write(' '.join(map(str, values)) + '\n')
else: out.write(''.join(map(str, values)) + '\n')
# close file or return string as necessary
out.write('\n')
if isinstance(outfile, str): out.close()
elif outfile is None: return out.getvalue()