aegis_sim.recording.popgenstatsrecorder
import numpy as np

from aegis_sim.utilities.funcs import skip


from .recorder import Recorder
from aegis_sim import submodels


class PopgenStatsRecorder(Recorder):
    """Recorder that appends population genetic statistics to CSV files in the "popgen" output subdirectory."""

    def __init__(self, odir):
        """Point the recorder at `odir / "popgen"` and call `init_odir()`."""
        popgen_dir = odir / "popgen"
        self.odir = popgen_dir
        # `init_odir` is not defined in this class; presumably inherited from Recorder.
        self.init_odir()

    def write(self, genomes, mutation_rates):
        """
        Update the population size history, then append popgen statistics to CSV files.

        The population size history is updated on every call. Everything else is
        skipped when skip("POPGENSTATS_RATE") is truthy or when genomes is empty.
        If any of the simple statistics is None, nothing is written at all
        (the complex statistics are skipped too). Each set of values is wrapped
        in a list and appended to its file via np.savetxt, using a comma
        delimiter and the "%1.3e" format.

        # OUTPUT SPECIFICATION
        path: /popgen/simple.csv
        filetype: csv
        category: population genetics
        description: Simple population metrics including population size, effective population size, mu, segregating sites, segregating sites using a genomic sample, theta, theta_w, theta_pi, tajimas_d, theta_h, and fayandwu_h.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/allele_frequencies.csv
        filetype: csv
        category: population genetics
        description: 1-allele population-frequencies of every genomic site.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/genotype_frequencies.csv
        filetype: csv
        category: population genetics
        description: Genotype frequencies at site resolution (e.g. for a diploid genome, number of 00, 01 and 11 for each site).
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/sfs.csv
        filetype: csv
        category: population genetics
        description: A site frequency spectrum.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/mean_h_per_bit_expected.csv
        filetype: csv
        category: population genetics
        description: Expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/mean_h_per_bit.csv
        filetype: csv
        category: population genetics
        description: Mean heterozygosity per bit.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/mean_h_per_locus.csv
        filetype: csv
        category: population genetics
        description: Mean bit heterozygosity per locus.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/reference_genome.csv
        filetype: csv
        category: population genetics
        description: Reference genome based on which allele is most common at each position.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/reference_genome_gsample.csv
        filetype: csv
        category: population genetics
        description: Reference genome based on which allele is most common at each position in a sample of genomes.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.
        """
        # The population size history is updated on every call, before any skip check.
        submodels.popgenstats.record_pop_size_history(genomes.array)

        if skip("POPGENSTATS_RATE"):
            return
        if len(genomes) == 0:
            return

        submodels.popgenstats.calc(genomes.array, mutation_rates)

        # One formatting convention shared by every row written below.
        row_format = {"delimiter": ",", "fmt": "%1.3e"}

        # Record simple statistics
        simple_values = list(submodels.popgenstats.emit_simple().values())
        if None in simple_values:
            # Returning here also skips the complex statistics further down.
            return

        simple_path = self.odir / "simple.csv"
        with open(simple_path, "ab") as handle:
            np.savetxt(handle, [simple_values], **row_format)

        # TODO when writing some metrics (e.g. reference genome, reference genome gsample) use the appropriate dtype (bool in that case)

        # Record complex statistics; each key names its own CSV file.
        for name, values in submodels.popgenstats.emit_complex().items():
            with open(self.odir / f"{name}.csv", "ab") as handle:
                np.savetxt(handle, [values], **row_format)
class PopgenStatsRecorder(Recorder):
    """Recorder that appends population genetic statistics to CSV files in the "popgen" output subdirectory."""

    def __init__(self, odir):
        """Point the recorder at `odir / "popgen"` and call `init_odir()`."""
        popgen_dir = odir / "popgen"
        self.odir = popgen_dir
        # `init_odir` is not defined in this class; presumably inherited from Recorder.
        self.init_odir()

    def write(self, genomes, mutation_rates):
        """
        Update the population size history, then append popgen statistics to CSV files.

        The population size history is updated on every call. Everything else is
        skipped when skip("POPGENSTATS_RATE") is truthy or when genomes is empty.
        If any of the simple statistics is None, nothing is written at all
        (the complex statistics are skipped too). Each set of values is wrapped
        in a list and appended to its file via np.savetxt, using a comma
        delimiter and the "%1.3e" format.

        # OUTPUT SPECIFICATION
        path: /popgen/simple.csv
        filetype: csv
        category: population genetics
        description: Simple population metrics including population size, effective population size, mu, segregating sites, segregating sites using a genomic sample, theta, theta_w, theta_pi, tajimas_d, theta_h, and fayandwu_h.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/allele_frequencies.csv
        filetype: csv
        category: population genetics
        description: 1-allele population-frequencies of every genomic site.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/genotype_frequencies.csv
        filetype: csv
        category: population genetics
        description: Genotype frequencies at site resolution (e.g. for a diploid genome, number of 00, 01 and 11 for each site).
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/sfs.csv
        filetype: csv
        category: population genetics
        description: A site frequency spectrum.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/mean_h_per_bit_expected.csv
        filetype: csv
        category: population genetics
        description: Expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/mean_h_per_bit.csv
        filetype: csv
        category: population genetics
        description: Mean heterozygosity per bit.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/mean_h_per_locus.csv
        filetype: csv
        category: population genetics
        description: Mean bit heterozygosity per locus.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/reference_genome.csv
        filetype: csv
        category: population genetics
        description: Reference genome based on which allele is most common at each position.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.

        # OUTPUT SPECIFICATION
        path: /popgen/reference_genome_gsample.csv
        filetype: csv
        category: population genetics
        description: Reference genome based on which allele is most common at each position in a sample of genomes.
        trait granularity:
        time granularity:
        frequency parameter: POPGENSTATS_RATE
        structure: A float matrix.
        """
        # The population size history is updated on every call, before any skip check.
        submodels.popgenstats.record_pop_size_history(genomes.array)

        if skip("POPGENSTATS_RATE"):
            return
        if len(genomes) == 0:
            return

        submodels.popgenstats.calc(genomes.array, mutation_rates)

        # One formatting convention shared by every row written below.
        row_format = {"delimiter": ",", "fmt": "%1.3e"}

        # Record simple statistics
        simple_values = list(submodels.popgenstats.emit_simple().values())
        if None in simple_values:
            # Returning here also skips the complex statistics further down.
            return

        simple_path = self.odir / "simple.csv"
        with open(simple_path, "ab") as handle:
            np.savetxt(handle, [simple_values], **row_format)

        # TODO when writing some metrics (e.g. reference genome, reference genome gsample) use the appropriate dtype (bool in that case)

        # Record complex statistics; each key names its own CSV file.
        for name, values in submodels.popgenstats.emit_complex().items():
            with open(self.odir / f"{name}.csv", "ab") as handle:
                np.savetxt(handle, [values], **row_format)
def write(self, genomes, mutation_rates):
    """
    Update the population size history, then append popgen statistics to CSV files.

    The population size history is updated on every call. Everything else is
    skipped when skip("POPGENSTATS_RATE") is truthy or when genomes is empty.
    If any of the simple statistics is None, nothing is written at all
    (the complex statistics are skipped too). Each set of values is wrapped
    in a list and appended to its file via np.savetxt, using a comma
    delimiter and the "%1.3e" format.

    # OUTPUT SPECIFICATION
    path: /popgen/simple.csv
    filetype: csv
    category: population genetics
    description: Simple population metrics including population size, effective population size, mu, segregating sites, segregating sites using a genomic sample, theta, theta_w, theta_pi, tajimas_d, theta_h, and fayandwu_h.
    trait granularity:
    time granularity:
    frequency parameter: POPGENSTATS_RATE
    structure: A float matrix.

    # OUTPUT SPECIFICATION
    path: /popgen/allele_frequencies.csv
    filetype: csv
    category: population genetics
    description: 1-allele population-frequencies of every genomic site.
    trait granularity:
    time granularity:
    frequency parameter: POPGENSTATS_RATE
    structure: A float matrix.

    # OUTPUT SPECIFICATION
    path: /popgen/genotype_frequencies.csv
    filetype: csv
    category: population genetics
    description: Genotype frequencies at site resolution (e.g. for a diploid genome, number of 00, 01 and 11 for each site).
    trait granularity:
    time granularity:
    frequency parameter: POPGENSTATS_RATE
    structure: A float matrix.

    # OUTPUT SPECIFICATION
    path: /popgen/sfs.csv
    filetype: csv
    category: population genetics
    description: A site frequency spectrum.
    trait granularity:
    time granularity:
    frequency parameter: POPGENSTATS_RATE
    structure: A float matrix.

    # OUTPUT SPECIFICATION
    path: /popgen/mean_h_per_bit_expected.csv
    filetype: csv
    category: population genetics
    description: Expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium.
    trait granularity:
    time granularity:
    frequency parameter: POPGENSTATS_RATE
    structure: A float matrix.

    # OUTPUT SPECIFICATION
    path: /popgen/mean_h_per_bit.csv
    filetype: csv
    category: population genetics
    description: Mean heterozygosity per bit.
    trait granularity:
    time granularity:
    frequency parameter: POPGENSTATS_RATE
    structure: A float matrix.

    # OUTPUT SPECIFICATION
    path: /popgen/mean_h_per_locus.csv
    filetype: csv
    category: population genetics
    description: Mean bit heterozygosity per locus.
    trait granularity:
    time granularity:
    frequency parameter: POPGENSTATS_RATE
    structure: A float matrix.

    # OUTPUT SPECIFICATION
    path: /popgen/reference_genome.csv
    filetype: csv
    category: population genetics
    description: Reference genome based on which allele is most common at each position.
    trait granularity:
    time granularity:
    frequency parameter: POPGENSTATS_RATE
    structure: A float matrix.

    # OUTPUT SPECIFICATION
    path: /popgen/reference_genome_gsample.csv
    filetype: csv
    category: population genetics
    description: Reference genome based on which allele is most common at each position in a sample of genomes.
    trait granularity:
    time granularity:
    frequency parameter: POPGENSTATS_RATE
    structure: A float matrix.
    """
    # The population size history is updated on every call, before any skip check.
    submodels.popgenstats.record_pop_size_history(genomes.array)

    if skip("POPGENSTATS_RATE"):
        return
    if len(genomes) == 0:
        return

    submodels.popgenstats.calc(genomes.array, mutation_rates)

    # One formatting convention shared by every row written below.
    row_format = {"delimiter": ",", "fmt": "%1.3e"}

    # Record simple statistics
    simple_values = list(submodels.popgenstats.emit_simple().values())
    if None in simple_values:
        # Returning here also skips the complex statistics further down.
        return

    simple_path = self.odir / "simple.csv"
    with open(simple_path, "ab") as handle:
        np.savetxt(handle, [simple_values], **row_format)

    # TODO when writing some metrics (e.g. reference genome, reference genome gsample) use the appropriate dtype (bool in that case)

    # Record complex statistics; each key names its own CSV file.
    for name, values in submodels.popgenstats.emit_complex().items():
        with open(self.odir / f"{name}.csv", "ab") as handle:
            np.savetxt(handle, [values], **row_format)
Record the population size history in popgenstats, then compute and record the popgen statistics.
OUTPUT SPECIFICATION
path: /popgen/simple.csv filetype: csv category: population genetics description: Simple population metrics including population size, effective population size, mu, segregating sites, segregating sites using a genomic sample, theta, theta_w, theta_pi, tajimas_d, theta_h, and fayandwu_h. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.
OUTPUT SPECIFICATION
path: /popgen/allele_frequencies.csv filetype: csv category: population genetics description: 1-allele population-frequencies of every genomic site. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.
OUTPUT SPECIFICATION
path: /popgen/genotype_frequencies.csv filetype: csv category: population genetics description: Genotype frequencies at site resolution (e.g. for a diploid genome, number of 00, 01 and 11 for each site). trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.
OUTPUT SPECIFICATION
path: /popgen/sfs.csv filetype: csv category: population genetics description: A site frequency spectrum. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.
OUTPUT SPECIFICATION
path: /popgen/mean_h_per_bit_expected.csv filetype: csv category: population genetics description: Expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.
OUTPUT SPECIFICATION
path: /popgen/mean_h_per_bit.csv filetype: csv category: population genetics description: Mean heterozygosity per bit. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.
OUTPUT SPECIFICATION
path: /popgen/mean_h_per_locus.csv filetype: csv category: population genetics description: Mean bit heterozygosity per locus. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.
OUTPUT SPECIFICATION
path: /popgen/reference_genome.csv filetype: csv category: population genetics description: Reference genome based on which allele is most common at each position. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.
OUTPUT SPECIFICATION
path: /popgen/reference_genome_gsample.csv filetype: csv category: population genetics description: Reference genome based on which allele is most common at each position in a sample of genomes. trait granularity: time granularity: frequency parameter: POPGENSTATS_RATE structure: A float matrix.