aegis_sim.recording.popgenstatsrecorder

  1import numpy as np
  2
  3from aegis_sim.utilities.funcs import skip
  4
  5
  6from .recorder import Recorder
  7from aegis_sim import submodels
  8
  9
 10class PopgenStatsRecorder(Recorder):
 11    def __init__(self, odir):
 12        self.odir = odir / "popgen"
 13        self.init_odir()
 14
 15    def write(self, genomes, mutation_rates):
 16        """
 17        Record population size in popgenstats, and record popgen statistics
 18
 19        # OUTPUT SPECIFICATION
 20        path: /popgen/simple.csv
 21        filetype: csv
 22        category: population genetics
 23        description: Simple population metrics including population size, effective population size, mu, segregating sites, segregating sites using a genomic sample, theta, theta_w, theta_pi, tajimas_d, theta_h, and fayandwu_h.
 24        trait granularity:
 25        time granularity:
 26        frequency parameter: POPGENSTATS_RATE
 27        structure: A float matrix.
 28
 29        # OUTPUT SPECIFICATION
 30        path: /popgen/allele_frequencies.csv
 31        filetype: csv
 32        category: population genetics
 33        description: 1-allele population-frequencies of every genomic site.
 34        trait granularity:
 35        time granularity:
 36        frequency parameter: POPGENSTATS_RATE
 37        structure: A float matrix.
 38
 39        # OUTPUT SPECIFICATION
 40        path: /popgen/genotype_frequencies.csv
 41        filetype: csv
 42        category: population genetics
 43        description: Genotype frequencies at site resolution (e.g. for a diploid genome, number of 00, 01 and 11 for each site).
 44        trait granularity:
 45        time granularity:
 46        frequency parameter: POPGENSTATS_RATE
 47        structure: A float matrix.
 48
 49        # OUTPUT SPECIFICATION
 50        path: /popgen/sfs.csv
 51        filetype: csv
 52        category: population genetics
 53        description: A site frequency spectrum.
 54        trait granularity:
 55        time granularity:
 56        frequency parameter: POPGENSTATS_RATE
 57        structure: A float matrix.
 58
 59        # OUTPUT SPECIFICATION
 60        path: /popgen/mean_h_per_bit_expected.csv
 61        filetype: csv
 62        category: population genetics
 63        description: Heterozygosity per bit.
 64        trait granularity:
 65        time granularity:
 66        frequency parameter: POPGENSTATS_RATE
 67        structure: A float matrix.
 68
 69        # OUTPUT SPECIFICATION
 70        path: /popgen/mean_h_per_bit.csv
 71        filetype: csv
 72        category: population genetics
 73        description: Expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium.
 74        trait granularity:
 75        time granularity:
 76        frequency parameter: POPGENSTATS_RATE
 77        structure: A float matrix.
 78
 79        # OUTPUT SPECIFICATION
 80        path: /popgen/mean_h_per_locus.csv
 81        filetype: csv
 82        category: population genetics
 83        description: Mean bit heterozygosity per locus.
 84        trait granularity:
 85        time granularity:
 86        frequency parameter: POPGENSTATS_RATE
 87        structure: A float matrix.
 88
 89        # OUTPUT SPECIFICATION
 90        path: /popgen/reference_genome.csv
 91        filetype: csv
 92        category: population genetics
 93        description: Reference genome based on which allele is most common at each position.
 94        trait granularity:
 95        time granularity:
 96        frequency parameter: POPGENSTATS_RATE
 97        structure: A float matrix.
 98
 99        # OUTPUT SPECIFICATION
100        path: /popgen/reference_genome_gsample.csv
101        filetype: csv
102        category: population genetics
103        description: Reference genome based on which allele is most common at each position in a sample of genomes.
104        trait granularity:
105        time granularity:
106        frequency parameter: POPGENSTATS_RATE
107        structure: A float matrix.
108        """
109        submodels.popgenstats.record_pop_size_history(genomes.array)
110
111        if skip("POPGENSTATS_RATE") or len(genomes) == 0:
112            return
113
114        submodels.popgenstats.calc(genomes.array, mutation_rates)
115
116        # Record simple statistics
117        array = list(submodels.popgenstats.emit_simple().values())
118        if None in array:
119            return
120
121        with open(self.odir / "simple.csv", "ab") as f:
122            np.savetxt(f, [array], delimiter=",", fmt="%1.3e")
123
124        # TODO when writing some metrics (e.g. reference genome, reference genome gsample) use the appropriate dtype (bool in that case)
125
126        # Record complex statistics
127        complex_statistics = submodels.popgenstats.emit_complex()
128        for key, array in complex_statistics.items():
129            with open(self.odir / f"{key}.csv", "ab") as f:
130                np.savetxt(f, [array], delimiter=",", fmt="%1.3e")
class PopgenStatsRecorder(aegis_sim.recording.recorder.Recorder):
 11class PopgenStatsRecorder(Recorder):
 12    def __init__(self, odir):
 13        self.odir = odir / "popgen"
 14        self.init_odir()
 15
 16    def write(self, genomes, mutation_rates):
 17        """
 18        Record population size in popgenstats, and record popgen statistics
 19
 20        # OUTPUT SPECIFICATION
 21        path: /popgen/simple.csv
 22        filetype: csv
 23        category: population genetics
 24        description: Simple population metrics including population size, effective population size, mu, segregating sites, segregating sites using a genomic sample, theta, theta_w, theta_pi, tajimas_d, theta_h, and fayandwu_h.
 25        trait granularity:
 26        time granularity:
 27        frequency parameter: POPGENSTATS_RATE
 28        structure: A float matrix.
 29
 30        # OUTPUT SPECIFICATION
 31        path: /popgen/allele_frequencies.csv
 32        filetype: csv
 33        category: population genetics
 34        description: 1-allele population-frequencies of every genomic site.
 35        trait granularity:
 36        time granularity:
 37        frequency parameter: POPGENSTATS_RATE
 38        structure: A float matrix.
 39
 40        # OUTPUT SPECIFICATION
 41        path: /popgen/genotype_frequencies.csv
 42        filetype: csv
 43        category: population genetics
 44        description: Genotype frequencies at site resolution (e.g. for a diploid genome, number of 00, 01 and 11 for each site).
 45        trait granularity:
 46        time granularity:
 47        frequency parameter: POPGENSTATS_RATE
 48        structure: A float matrix.
 49
 50        # OUTPUT SPECIFICATION
 51        path: /popgen/sfs.csv
 52        filetype: csv
 53        category: population genetics
 54        description: A site frequency spectrum.
 55        trait granularity:
 56        time granularity:
 57        frequency parameter: POPGENSTATS_RATE
 58        structure: A float matrix.
 59
 60        # OUTPUT SPECIFICATION
 61        path: /popgen/mean_h_per_bit_expected.csv
 62        filetype: csv
 63        category: population genetics
 64        description: Heterozygosity per bit.
 65        trait granularity:
 66        time granularity:
 67        frequency parameter: POPGENSTATS_RATE
 68        structure: A float matrix.
 69
 70        # OUTPUT SPECIFICATION
 71        path: /popgen/mean_h_per_bit.csv
 72        filetype: csv
 73        category: population genetics
 74        description: Expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium.
 75        trait granularity:
 76        time granularity:
 77        frequency parameter: POPGENSTATS_RATE
 78        structure: A float matrix.
 79
 80        # OUTPUT SPECIFICATION
 81        path: /popgen/mean_h_per_locus.csv
 82        filetype: csv
 83        category: population genetics
 84        description: Mean bit heterozygosity per locus.
 85        trait granularity:
 86        time granularity:
 87        frequency parameter: POPGENSTATS_RATE
 88        structure: A float matrix.
 89
 90        # OUTPUT SPECIFICATION
 91        path: /popgen/reference_genome.csv
 92        filetype: csv
 93        category: population genetics
 94        description: Reference genome based on which allele is most common at each position.
 95        trait granularity:
 96        time granularity:
 97        frequency parameter: POPGENSTATS_RATE
 98        structure: A float matrix.
 99
100        # OUTPUT SPECIFICATION
101        path: /popgen/reference_genome_gsample.csv
102        filetype: csv
103        category: population genetics
104        description: Reference genome based on which allele is most common at each position in a sample of genomes.
105        trait granularity:
106        time granularity:
107        frequency parameter: POPGENSTATS_RATE
108        structure: A float matrix.
109        """
110        submodels.popgenstats.record_pop_size_history(genomes.array)
111
112        if skip("POPGENSTATS_RATE") or len(genomes) == 0:
113            return
114
115        submodels.popgenstats.calc(genomes.array, mutation_rates)
116
117        # Record simple statistics
118        array = list(submodels.popgenstats.emit_simple().values())
119        if None in array:
120            return
121
122        with open(self.odir / "simple.csv", "ab") as f:
123            np.savetxt(f, [array], delimiter=",", fmt="%1.3e")
124
125        # TODO when writing some metrics (e.g. reference genome, reference genome gsample) use the appropriate dtype (bool in that case)
126
127        # Record complex statistics
128        complex_statistics = submodels.popgenstats.emit_complex()
129        for key, array in complex_statistics.items():
130            with open(self.odir / f"{key}.csv", "ab") as f:
131                np.savetxt(f, [array], delimiter=",", fmt="%1.3e")
PopgenStatsRecorder(odir)
12    def __init__(self, odir):
13        self.odir = odir / "popgen"
14        self.init_odir()
odir
def write(self, genomes, mutation_rates):
 16    def write(self, genomes, mutation_rates):
 17        """
 18        Record population size in popgenstats, and record popgen statistics
 19
 20        # OUTPUT SPECIFICATION
 21        path: /popgen/simple.csv
 22        filetype: csv
 23        category: population genetics
 24        description: Simple population metrics including population size, effective population size, mu, segregating sites, segregating sites using a genomic sample, theta, theta_w, theta_pi, tajimas_d, theta_h, and fayandwu_h.
 25        trait granularity:
 26        time granularity:
 27        frequency parameter: POPGENSTATS_RATE
 28        structure: A float matrix.
 29
 30        # OUTPUT SPECIFICATION
 31        path: /popgen/allele_frequencies.csv
 32        filetype: csv
 33        category: population genetics
 34        description: 1-allele population-frequencies of every genomic site.
 35        trait granularity:
 36        time granularity:
 37        frequency parameter: POPGENSTATS_RATE
 38        structure: A float matrix.
 39
 40        # OUTPUT SPECIFICATION
 41        path: /popgen/genotype_frequencies.csv
 42        filetype: csv
 43        category: population genetics
 44        description: Genotype frequencies at site resolution (e.g. for a diploid genome, number of 00, 01 and 11 for each site).
 45        trait granularity:
 46        time granularity:
 47        frequency parameter: POPGENSTATS_RATE
 48        structure: A float matrix.
 49
 50        # OUTPUT SPECIFICATION
 51        path: /popgen/sfs.csv
 52        filetype: csv
 53        category: population genetics
 54        description: A site frequency spectrum.
 55        trait granularity:
 56        time granularity:
 57        frequency parameter: POPGENSTATS_RATE
 58        structure: A float matrix.
 59
 60        # OUTPUT SPECIFICATION
 61        path: /popgen/mean_h_per_bit_expected.csv
 62        filetype: csv
 63        category: population genetics
 64        description: Heterozygosity per bit.
 65        trait granularity:
 66        time granularity:
 67        frequency parameter: POPGENSTATS_RATE
 68        structure: A float matrix.
 69
 70        # OUTPUT SPECIFICATION
 71        path: /popgen/mean_h_per_bit.csv
 72        filetype: csv
 73        category: population genetics
 74        description: Expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium.
 75        trait granularity:
 76        time granularity:
 77        frequency parameter: POPGENSTATS_RATE
 78        structure: A float matrix.
 79
 80        # OUTPUT SPECIFICATION
 81        path: /popgen/mean_h_per_locus.csv
 82        filetype: csv
 83        category: population genetics
 84        description: Mean bit heterozygosity per locus.
 85        trait granularity:
 86        time granularity:
 87        frequency parameter: POPGENSTATS_RATE
 88        structure: A float matrix.
 89
 90        # OUTPUT SPECIFICATION
 91        path: /popgen/reference_genome.csv
 92        filetype: csv
 93        category: population genetics
 94        description: Reference genome based on which allele is most common at each position.
 95        trait granularity:
 96        time granularity:
 97        frequency parameter: POPGENSTATS_RATE
 98        structure: A float matrix.
 99
100        # OUTPUT SPECIFICATION
101        path: /popgen/reference_genome_gsample.csv
102        filetype: csv
103        category: population genetics
104        description: Reference genome based on which allele is most common at each position in a sample of genomes.
105        trait granularity:
106        time granularity:
107        frequency parameter: POPGENSTATS_RATE
108        structure: A float matrix.
109        """
110        submodels.popgenstats.record_pop_size_history(genomes.array)
111
112        if skip("POPGENSTATS_RATE") or len(genomes) == 0:
113            return
114
115        submodels.popgenstats.calc(genomes.array, mutation_rates)
116
117        # Record simple statistics
118        array = list(submodels.popgenstats.emit_simple().values())
119        if None in array:
120            return
121
122        with open(self.odir / "simple.csv", "ab") as f:
123            np.savetxt(f, [array], delimiter=",", fmt="%1.3e")
124
125        # TODO when writing some metrics (e.g. reference genome, reference genome gsample) use the appropriate dtype (bool in that case)
126
127        # Record complex statistics
128        complex_statistics = submodels.popgenstats.emit_complex()
129        for key, array in complex_statistics.items():
130            with open(self.odir / f"{key}.csv", "ab") as f:
131                np.savetxt(f, [array], delimiter=",", fmt="%1.3e")

Record population size in popgenstats, and record popgen statistics

OUTPUT SPECIFICATION

path: /popgen/simple.csv filetype: csv category: population genetics description: Simple population metrics including population size, effective population size, mu, segregating sites, segregating sites using a genomic sample, theta, theta_w, theta_pi, tajimas_d, theta_h, and fayandwu_h. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.

OUTPUT SPECIFICATION

path: /popgen/allele_frequencies.csv filetype: csv category: population genetics description: 1-allele population-frequencies of every genomic site. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.

OUTPUT SPECIFICATION

path: /popgen/genotype_frequencies.csv filetype: csv category: population genetics description: Genotype frequencies at site resolution (e.g. for a diploid genome, number of 00, 01 and 11 for each site). trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.

OUTPUT SPECIFICATION

path: /popgen/sfs.csv filetype: csv category: population genetics description: A site frequency spectrum. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.

OUTPUT SPECIFICATION

path: /popgen/mean_h_per_bit_expected.csv filetype: csv category: population genetics description: Expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.

OUTPUT SPECIFICATION

path: /popgen/mean_h_per_bit.csv filetype: csv category: population genetics description: Heterozygosity per bit. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.

OUTPUT SPECIFICATION

path: /popgen/mean_h_per_locus.csv filetype: csv category: population genetics description: Mean bit heterozygosity per locus. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.

OUTPUT SPECIFICATION

path: /popgen/reference_genome.csv filetype: csv category: population genetics description: Reference genome based on which allele is most common at each position. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.

OUTPUT SPECIFICATION

path: /popgen/reference_genome_gsample.csv filetype: csv category: population genetics description: Reference genome based on which allele is most common at each position in a sample of genomes. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.