Source code for egglib.io._genepop

"""
    Copyright 2023 Stéphane De Mita, Mathieu Siol

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

import os, re
from .. import _interface, alphabets

[docs]def from_genepop(fname): """ Import Genepop-formatted genotypic data. The format is described `here <https://genepop.curtin.edu.au/help_input.html>`_. :param fname: Genepop-formatted file name. :return: A new :class:`.Align` instance. The returned object contains data mapped to an *ad hoc* alphabet, with two samples per individuals. Group labels are used to indicate the structure (first level: populations, second level: individuals). In addition to normal :class:`.Align` instance, the returned object has two attributes: :attr:`!title` and :attr:`!loci`, which contain the information read from the Genepop file. """ with open(fname) as f: # read title title = f.readline().strip() if len(title) == 0: raise ValueError('title cannot be empty') # read locus names loci = [] for line in f: if line == '': raise ValueError('unexpected end of line') line = line.strip() if line in {'POP', 'Pop', 'pop'}: break loci.extend(map(str.strip, re.split(', ?', line))) # initialise data res = _interface.Align(alphabets.genepop) # read pops pop_idx = 0 while True: for line in f: line = line.strip() if line in {'POP', 'Pop', 'pop'}: pop_idx += 1 break # read indiv hit = re.match('(.+),(.+)', line) if hit is None: raise ValueError('invalid line: {0}'.format(line)) name, genos = hit.groups() name = name.strip() genos = list(map(str.strip, genos.split())) if len(genos) != len(loci): raise ValueError('inconsistent number of loci') # read genotypes and import lens = set(map(len, genos)) if lens == {4}: res.add_sample(name+'_1', [int(geno[:2]) for geno in genos], ['pop{0}'.format(pop_idx+1), name]) res.add_sample(name+'_2', [int(geno[2:]) for geno in genos], ['pop{0}'.format(pop_idx+1), name]) elif lens == {6}: res.add_sample(name+'_1', [int(geno[:3]) for geno in genos], ['pop{0}'.format(pop_idx+1), name]) res.add_sample(name+'_2', [int(geno[3:]) for geno in genos], ['pop{0}'.format(pop_idx+1), name]) else: raise ValueError('invalid number of characters in genotype') else: break # return alignment res.title = title res.loci = loci return res