# Source code for egglib.io._genepop

"""
    Copyright 2023 Stéphane De Mita, Mathieu Siol

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

import os, re
from .. import _interface, alphabets


# [docs] -- link marker left by the documentation source viewer; not part of the module
def from_genepop(fname):
    """
    Import Genepop-formatted genotypic data. The format is described
    `here <https://genepop.curtin.edu.au/help_input.html>`_.

    :param fname: Genepop-formatted file name.

    :return: A new :class:`.Align` instance.

    :raises ValueError: if the title line is empty, if the file ends
        before the first population keyword (``POP``, ``Pop`` or
        ``pop``), if an individual line does not have the
        ``name, genotypes`` structure, if the number of genotypes of an
        individual differs from the number of loci, if the genotypes of
        an individual are not all 4 or all 6 characters long, or if an
        allele cannot be converted to an integer.

    The returned object contains data mapped to an *ad hoc* alphabet,
    with two samples per individual (named after the individual with
    ``_1`` and ``_2`` suffixes). Group labels are used to indicate
    the structure (first level: populations, second level: individuals).
    In addition to normal :class:`.Align` instance, the returned object
    has two attributes: :attr:`!title` and :attr:`!loci`, which contain
    the information read from the Genepop file.
    """

    # accepted spellings of the population separator
    pop_keywords = {'POP', 'Pop', 'pop'}

    with open(fname) as f:
        # read title
        title = f.readline().strip()
        if len(title) == 0: raise ValueError('title cannot be empty')

        # read locus names (one per line and/or comma-separated) until
        # the first population keyword
        loci = []
        for line in f:
            line = line.strip()
            if line in pop_keywords: break
            loci.extend(map(str.strip, re.split(', ?', line)))
        else:
            # iterating a file never yields an empty string, so reaching
            # the end of the file is detected by the loop finishing
            # without meeting a population keyword
            raise ValueError('unexpected end of file')

        # initialise data
        res = _interface.Align(alphabets.genepop)

        # read pops: every further keyword opens a new population
        pop_idx = 0
        for line in f:
            line = line.strip()
            if line in pop_keywords:
                pop_idx += 1
                continue

            # read indiv (greedy match: the name extends up to the last
            # comma that is followed by at least one character)
            hit = re.match('(.+),(.+)', line)
            if hit is None: raise ValueError('invalid line: {0}'.format(line))
            name, genos = hit.groups()
            name = name.strip()
            genos = genos.split()
            if len(genos) != len(loci): raise ValueError('inconsistent number of loci')

            # all genotypes of an individual must use the same coding:
            # 4 characters (2 per allele) or 6 characters (3 per allele)
            lens = set(map(len, genos))
            if lens == {4}:
                width = 2
            elif lens == {6}:
                width = 3
            else:
                raise ValueError('invalid number of characters in genotype')

            # import the two alleles of each locus as two distinct samples
            pop_label = 'pop{0}'.format(pop_idx+1)
            res.add_sample(name+'_1', [int(geno[:width]) for geno in genos], [pop_label, name])
            res.add_sample(name+'_2', [int(geno[width:]) for geno in genos], [pop_label, name])

        # return alignment
        res.title = title
        res.loci = loci
        return res