aegis_sim.utilities.popgenstats
Contains functions for the computation of relevant population genetic metrics
1"""Contains functions for the computation of relevant population genetic metrics""" 2 3import statistics 4import itertools 5import logging 6import numpy as np 7 8from aegis_sim.parameterization import parametermanager 9 10class PopgenStats: 11 def __init__(self): 12 self.pop_size_history = [] 13 14 def record_pop_size_history(self, genomes): 15 """Records population sizes at last 1000 steps""" 16 if len(self.pop_size_history) >= 1000: 17 del self.pop_size_history[0] 18 self.pop_size_history.append(len(genomes)) 19 20 def calc(self, input_genomes, mutation_rates): 21 """Calculates all popgen metrics 22 23 Set sample_size value to 0 or -1 to not perform any sampling. 24 """ 25 26 # Infer ploidy from genomes 27 self.ploidy = input_genomes.shape[1] 28 29 # Data to process 30 self.genomes = self.make_3D(input_genomes) 31 self.gsample = self.get_genomes_sample() 32 self.nsample = 0 if self.gsample is None else len(self.gsample) # TODO correct for ploidy? 33 34 # Statistics on population 35 self.n = self.get_n() 36 self.ne = self.get_ne() 37 self.allele_frequencies = self.get_allele_frequencies() 38 self.genotype_frequencies = self.get_genotype_frequencies() 39 self.mean_h_per_bit = self.get_mean_h_per_bit() 40 self.mean_h_per_locus = self.get_mean_h_per_locus() 41 self.mean_h = self.get_mean_h() 42 self.mean_h_per_bit_expected = self.get_mean_h_per_bit_expected() 43 self.mean_h_expected = self.get_mean_h_expected() 44 self.mu = self.get_mu(mutation_rates) 45 self.theta = self.get_theta() 46 self.segregating_sites = self.get_segregating_sites(self.genomes, self.ploidy) 47 self.reference_genome = self.get_reference_genome(self.genomes) 48 49 # Statistics on sample 50 if self.nsample: 51 # TODO gsample is essentially haploid, check if that causes issues 52 self.reference_genome_gsample = self.get_reference_genome(self.gsample) 53 self.segregating_sites_gsample = self.get_segregating_sites(self.gsample, 1) 54 self.theta_w = self.get_theta_w() 55 self.theta_pi = self.get_theta_pi() 56 self.tajimas_d = self.get_tajimas_d() 57 self.sfs = self.get_sfs(self.reference_genome_gsample) # Uses reference genome calculated from sample 58 self.theta_h = self.get_theta_h() 59 self.fayandwu_h = self.get_fayandwu_h() 60 else: 61 # TODO will cause Recorder to fail? 62 ( 63 self.reference_genome_gsample, 64 self.segregating_sites_gsample, 65 self.theta_w, 66 self.theta_pi, 67 self.tajimas_d, 68 self.sfs, 69 self.theta_h, 70 self.fayandwu_h, 71 ) = [None] * 8 72 73 def emit_simple(self): 74 # TODO add headers 75 # TODO add headers and indices to all output files; and index and column names 76 77 attrs = [ 78 "n", 79 "ne", 80 "mu", 81 "segregating_sites", 82 "segregating_sites_gsample", 83 "theta", 84 "theta_w", 85 "theta_pi", 86 "tajimas_d", 87 "theta_h", 88 "fayandwu_h", 89 ] 90 91 if self.ploidy == 2: 92 attrs += ["mean_h", "mean_h_expected"] 93 94 return {attr: getattr(self, attr) for attr in attrs} 95 96 def emit_complex(self): 97 attrs = [ 98 "allele_frequencies", 99 "genotype_frequencies", 100 "sfs", 101 "reference_genome", 102 "reference_genome_gsample", 103 ] 104 105 if self.ploidy == 2: 106 attrs += ["mean_h_per_bit", "mean_h_per_locus", "mean_h_per_bit_expected"] 107 108 return {attr: getattr(self, attr) for attr in attrs} 109 110 #################### 111 # HELPER FUNCTIONS # 112 #################### 113 114 @staticmethod 115 def harmonic(i): 116 """Returns the i-th harmonic number""" 117 return (1 / np.arange(1, i + 1)).sum() 118 119 @staticmethod 120 def harmonic_sq(i): 121 """Returns the i-th harmonic square""" 122 return (1 / np.arange(1, i + 1) ** 2).sum() 123 124 @staticmethod 125 def make_3D(input_genomes): 126 """Returns genomes array with merged chromosomes 127 128 Methods of PopgenStats require the genomes to be in the form [individual, locus, bit] where, if individuals are diploid, 129 the odd bits belong to a virtual first chromosome, while the even bits belong to a virtual second chromosome. 130 If individuals are haploid, all bits belong to one virtual chromosome. 131 132 This is contrast with the genomes arrays in the rest of AEGIS which is in form [individual, chromosome, locus, bit] 133 so that all bits from one chromosome are in a separate array than those in the second chromosome. 134 If individuals are haploid, the chromosome dimension contains only one element. 135 """ 136 137 n_individuals, ploidy, n_loci, bits_per_locus = input_genomes.shape 138 139 # TODO report when conversion cannot be executed 140 141 if ploidy == 1: 142 return input_genomes[:, 0] 143 144 else: 145 genomes = np.empty( 146 shape=(n_individuals, n_loci, ploidy * bits_per_locus), 147 dtype=np.bool_, 148 ) 149 150 # Odd bits contain bits from chromosome 0 151 genomes[:, :, 0::2] = input_genomes[:, 0] 152 153 # Even bits contain bits from chromosome 1 154 genomes[:, :, 1::2] = input_genomes[:, 1] 155 156 return genomes 157 158 @staticmethod 159 def make_4D(genomes, ploidy): 160 """Returns genomes array with chromosomes split along the second dimension""" 161 162 n_individuals, n_loci, n_bits = genomes.shape 163 bits_per_locus = n_bits // ploidy 164 165 # TODO report when conversion cannot be executed 166 167 unstaggered = np.empty( 168 shape=(n_individuals, ploidy, n_loci, bits_per_locus), 169 dtype=np.bool_, 170 ) 171 172 if ploidy == 1: 173 unstaggered[:, 0] = genomes 174 if ploidy == 2: 175 # Chromosome 0 contains odd bits from input genomes 176 unstaggered[:, 0] = genomes[:, :, 0::2] 177 # Chromosome 1 contains even bits from input genomes 178 unstaggered[:, 1] = genomes[:, :, 1::2] 179 180 return unstaggered 181 182 def get_genomes_sample(self): 183 """Returns a random sample of genomes""" 184 if self.ploidy > 1: 185 # The chromosomes get aligned 186 # 3D (individuals, loci, bits): [1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4] -> 187 # 188 # 2D (everything, 2): [[1, 1, 1, 1, 3, 3, 3, 3], 189 # [2, 2, 2, 2, 4, 4, 4, 4]] -> 190 # 191 # 3D (chromosomes, loci, bits // 2): [[1, 1, 1, 1], 192 # (individuals * 2, ...) [2, 2, 2, 2], 193 # [3, 3, 3, 3], 194 # [4, 4, 4, 4]] 195 genomes = ( 196 self.genomes.reshape(-1, 2).transpose().reshape(self.genomes.shape[0] * 2, self.genomes.shape[1], -1) 197 ) 198 else: 199 genomes = self.genomes 200 201 # TODO check if change in ploidy has implications for popgen stats 202 203 # Check if there are enough genomes to sample 204 if len(genomes) < 2: # NOTE tajimas_d requires minimum 2 205 return None 206 207 # Sample genomes 208 if 0 < parametermanager.parameters.POPGENSTATS_SAMPLE_SIZE <= genomes.shape[0]: 209 indices = np.random.choice( 210 range(genomes.shape[0]), 211 parametermanager.parameters.POPGENSTATS_SAMPLE_SIZE, 212 replace=False, 213 ) 214 return genomes[indices] 215 else: 216 return genomes 217 218 #################################### 219 # OUTPUT: Statistics on population # 220 #################################### 221 222 def get_n(self): 223 """Returns the census population size N""" 224 return self.pop_size_history[-1] 225 226 def get_ne(self): 227 """Returns the effective population size Ne""" 228 return statistics.harmonic_mean(self.pop_size_history) 229 230 def get_allele_frequencies(self): 231 """Returns the frequency of the 1-allele at every position of the genome""" 232 # Aligns the genomes of the population, disregarding chromosomes, and takes the mean 233 return self.genomes.reshape(self.genomes.shape[0], -1).mean(0) 234 235 def get_genotype_frequencies(self): 236 """Output: [loc1_bit1_freq00, loc1_bit1_freq01, loc1_bit1_freq11, loc1_bit2_freq00, ...]""" 237 if self.ploidy == 1: 238 return self.allele_frequencies 239 240 len_pop = self.genomes.shape[0] 241 242 # Genotype = Sum of alleles at a position -> [0, 1, 2] 243 genotypes_raw = self.genomes.reshape(-1, 2).sum(1).reshape(len_pop, -1).transpose() 244 245 # Counts the frequencies of 0, 1 and 2 across the population 246 genotype_freqs = np.array([np.bincount(x, minlength=3) for x in genotypes_raw]).reshape(-1) / len_pop 247 248 return genotype_freqs 249 250 def get_mean_h_per_bit(self): 251 """Returns the mean heterozygosity per bit. 252 Output: [Hloc1_bit1, Hloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci""" 253 if self.ploidy == 1: 254 return None 255 256 return self.genotype_frequencies[1::3] 257 258 def get_mean_h_per_locus(self): 259 """Returns the mean heterozygosity per locus. 260 Output: [Hloc1, Hloc2, ...] Entries: nloci""" 261 if self.ploidy == 1: 262 return None 263 264 h_per_bit = self.mean_h_per_bit 265 return h_per_bit.reshape(-1, self.genomes.shape[2] // 2).mean(1) 266 267 def get_mean_h(self): 268 """Returns the mean heterozygosity of a population. 269 Output: H""" 270 if self.ploidy == 1: 271 return None 272 273 return self.mean_h_per_bit.mean() 274 275 def get_mean_h_per_bit_expected(self): 276 """Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium. 277 Output: [Heloc1_bit1, Heloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci""" 278 if self.ploidy == 1: 279 return None 280 281 genotype_freqs_sqrd = self.genotype_frequencies**2 282 sum_each_locus = genotype_freqs_sqrd.reshape(-1, 3).sum(1) 283 return 1 - sum_each_locus 284 285 def get_mean_h_expected(self): 286 """Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium. 287 Output: He""" 288 if self.ploidy == 1: 289 return None 290 291 return self.mean_h_per_bit_expected.mean() 292 293 def get_mu(self, mutation_rates): 294 """Return the mutation rate µ per gene per generation -> AEGIS-'Locus' interpreted as a gene""" 295 return np.mean(mutation_rates) 296 297 def get_theta(self): 298 """Returns the adjusted mutation rate theta = 4 * Ne * µ, 299 where µ is the mutation rate per gene per generation and Ne is the effective population size""" 300 return (self.ploidy * 2) * self.ne * self.mu 301 302 ############################################## 303 # OUTPUT: Statistics on population or sample # 304 ############################################## 305 306 def get_reference_genome(self, genomes): 307 """Returns the reference genome based on which allele is most common at each position. 308 Equal fractions -> 0""" 309 return np.round(genomes.reshape(genomes.shape[0], -1).mean(0)).astype("int32") 310 311 def get_segregating_sites(self, genomes, ploidy): 312 """Returns the number of segregating sites 313 314 Genomes are first aligned and summed at each site across the population. 315 A site is segregating if its sum is not equal to either 0 or the population size N. 316 """ 317 318 if ploidy == 1: 319 pre_segr_sites = genomes.reshape(genomes.shape[0], -1).sum(0) 320 segr_sites = ( 321 genomes.shape[1] * genomes.shape[2] 322 - (pre_segr_sites == genomes.shape[0]).sum() 323 - (pre_segr_sites == 0).sum() 324 ) 325 326 return segr_sites 327 328 pre_segr_sites = genomes.reshape(-1, 2).transpose().reshape(genomes.shape[0] * 2, -1).sum(0) 329 segr_sites = ( 330 ((genomes.shape[1] * genomes.shape[2]) // 2) 331 - (pre_segr_sites == genomes.shape[0] * 2).sum() 332 - (pre_segr_sites == 0).sum() 333 ) 334 return segr_sites 335 336 ################################ 337 # OUTPUT: Statistics on sample # 338 ################################ 339 340 def get_theta_w(self): 341 """Returns Watterson's estimator theta_w""" 342 return self.segregating_sites_gsample / self.harmonic(self.nsample - 1) 343 344 def get_theta_pi(self): 345 """Returns the estimator theta_pi (based on pairwise differences)""" 346 combs = itertools.combinations(range(self.nsample), 2) 347 348 # Aligns chromosomes and count pairwise differences 349 genomes_sample_flat = self.gsample.reshape(self.nsample, -1) 350 diffs = np.array([(genomes_sample_flat[i[0]] != genomes_sample_flat[i[1]]).sum() for i in combs]) 351 total_diffs = diffs.sum() 352 ncomparisons = diffs.size 353 354 return total_diffs / ncomparisons 355 356 def get_tajimas_d(self): 357 """Returns Tajima's D""" 358 359 if self.nsample < 3: 360 return None 361 362 pre_d = self.theta_pi - self.theta_w 363 segr_sites = self.segregating_sites_gsample 364 365 if segr_sites == 0: 366 logging.info("Cannot compute Tajima's D because there are no segregating sites.") 367 return 368 369 a_1 = self.harmonic(self.nsample - 1) 370 a_2 = self.harmonic_sq(self.nsample - 1) 371 b_1 = (self.nsample + 1) / (3 * (self.nsample - 1)) 372 b_2 = (2 * (self.nsample**2 + self.nsample + 3)) / (9 * self.nsample * (self.nsample - 1)) 373 c_1 = b_1 - (1 / a_1) 374 c_2 = b_2 - ((self.nsample + 2) / (a_1 * self.nsample)) + (a_2 / (a_1**2)) 375 e_1 = c_1 / a_1 376 e_2 = c_2 / ((a_1**2) + a_2) 377 d_stdev = ((e_1 * segr_sites) + (e_2 * segr_sites * (segr_sites - 1))) ** 0.5 378 379 return pre_d / d_stdev 380 381 def get_sfs(self, reference_genome): 382 """Returns the site frequency spectrum (allele frequency spectrum) of a sample""" 383 pre_sfs = self.gsample.reshape(self.nsample, -1).sum(0) 384 pre_sfs[np.nonzero(reference_genome)] -= self.nsample 385 pre_sfs = np.abs(pre_sfs) 386 sfs = np.bincount(pre_sfs, minlength=self.nsample + 1)[:-1] # TODO what if len(genomes) < sample_size 387 return sfs 388 389 def get_theta_h(self): 390 """Returns Fay and Wu's estimator theta_h""" 391 # sum from i=1 to i=n-1: ( (2 * S_i * i^2) / (n * (n-1)) ) 392 sfs = self.sfs 393 t_h = ((2 * sfs * (np.arange(self.nsample) ** 2)) / (self.nsample * (self.nsample - 1))).sum() 394 return t_h 395 396 def get_fayandwu_h(self): 397 """Returns Fay and Wu's H""" 398 pre_h = self.theta_pi - self.theta_h 399 h_stdev = 1 # TODO: Calculate actual variance of h 400 return pre_h / h_stdev
11class PopgenStats: 12 def __init__(self): 13 self.pop_size_history = [] 14 15 def record_pop_size_history(self, genomes): 16 """Records population sizes at last 1000 steps""" 17 if len(self.pop_size_history) >= 1000: 18 del self.pop_size_history[0] 19 self.pop_size_history.append(len(genomes)) 20 21 def calc(self, input_genomes, mutation_rates): 22 """Calculates all popgen metrics 23 24 Set sample_size value to 0 or -1 to not perform any sampling. 25 """ 26 27 # Infer ploidy from genomes 28 self.ploidy = input_genomes.shape[1] 29 30 # Data to process 31 self.genomes = self.make_3D(input_genomes) 32 self.gsample = self.get_genomes_sample() 33 self.nsample = 0 if self.gsample is None else len(self.gsample) # TODO correct for ploidy? 34 35 # Statistics on population 36 self.n = self.get_n() 37 self.ne = self.get_ne() 38 self.allele_frequencies = self.get_allele_frequencies() 39 self.genotype_frequencies = self.get_genotype_frequencies() 40 self.mean_h_per_bit = self.get_mean_h_per_bit() 41 self.mean_h_per_locus = self.get_mean_h_per_locus() 42 self.mean_h = self.get_mean_h() 43 self.mean_h_per_bit_expected = self.get_mean_h_per_bit_expected() 44 self.mean_h_expected = self.get_mean_h_expected() 45 self.mu = self.get_mu(mutation_rates) 46 self.theta = self.get_theta() 47 self.segregating_sites = self.get_segregating_sites(self.genomes, self.ploidy) 48 self.reference_genome = self.get_reference_genome(self.genomes) 49 50 # Statistics on sample 51 if self.nsample: 52 # TODO gsample is essentially haploid, check if that causes issues 53 self.reference_genome_gsample = self.get_reference_genome(self.gsample) 54 self.segregating_sites_gsample = self.get_segregating_sites(self.gsample, 1) 55 self.theta_w = self.get_theta_w() 56 self.theta_pi = self.get_theta_pi() 57 self.tajimas_d = self.get_tajimas_d() 58 self.sfs = self.get_sfs(self.reference_genome_gsample) # Uses reference genome calculated from sample 59 self.theta_h = self.get_theta_h() 60 self.fayandwu_h = self.get_fayandwu_h() 61 else: 62 # TODO will cause Recorder to fail? 63 ( 64 self.reference_genome_gsample, 65 self.segregating_sites_gsample, 66 self.theta_w, 67 self.theta_pi, 68 self.tajimas_d, 69 self.sfs, 70 self.theta_h, 71 self.fayandwu_h, 72 ) = [None] * 8 73 74 def emit_simple(self): 75 # TODO add headers 76 # TODO add headers and indices to all output files; and index and column names 77 78 attrs = [ 79 "n", 80 "ne", 81 "mu", 82 "segregating_sites", 83 "segregating_sites_gsample", 84 "theta", 85 "theta_w", 86 "theta_pi", 87 "tajimas_d", 88 "theta_h", 89 "fayandwu_h", 90 ] 91 92 if self.ploidy == 2: 93 attrs += ["mean_h", "mean_h_expected"] 94 95 return {attr: getattr(self, attr) for attr in attrs} 96 97 def emit_complex(self): 98 attrs = [ 99 "allele_frequencies", 100 "genotype_frequencies", 101 "sfs", 102 "reference_genome", 103 "reference_genome_gsample", 104 ] 105 106 if self.ploidy == 2: 107 attrs += ["mean_h_per_bit", "mean_h_per_locus", "mean_h_per_bit_expected"] 108 109 return {attr: getattr(self, attr) for attr in attrs} 110 111 #################### 112 # HELPER FUNCTIONS # 113 #################### 114 115 @staticmethod 116 def harmonic(i): 117 """Returns the i-th harmonic number""" 118 return (1 / np.arange(1, i + 1)).sum() 119 120 @staticmethod 121 def harmonic_sq(i): 122 """Returns the i-th harmonic square""" 123 return (1 / np.arange(1, i + 1) ** 2).sum() 124 125 @staticmethod 126 def make_3D(input_genomes): 127 """Returns genomes array with merged chromosomes 128 129 Methods of PopgenStats require the genomes to be in the form [individual, locus, bit] where, if individuals are diploid, 130 the odd bits belong to a virtual first chromosome, while the even bits belong to a virtual second chromosome. 131 If individuals are haploid, all bits belong to one virtual chromosome. 132 133 This is contrast with the genomes arrays in the rest of AEGIS which is in form [individual, chromosome, locus, bit] 134 so that all bits from one chromosome are in a separate array than those in the second chromosome. 135 If individuals are haploid, the chromosome dimension contains only one element. 136 """ 137 138 n_individuals, ploidy, n_loci, bits_per_locus = input_genomes.shape 139 140 # TODO report when conversion cannot be executed 141 142 if ploidy == 1: 143 return input_genomes[:, 0] 144 145 else: 146 genomes = np.empty( 147 shape=(n_individuals, n_loci, ploidy * bits_per_locus), 148 dtype=np.bool_, 149 ) 150 151 # Odd bits contain bits from chromosome 0 152 genomes[:, :, 0::2] = input_genomes[:, 0] 153 154 # Even bits contain bits from chromosome 1 155 genomes[:, :, 1::2] = input_genomes[:, 1] 156 157 return genomes 158 159 @staticmethod 160 def make_4D(genomes, ploidy): 161 """Returns genomes array with chromosomes split along the second dimension""" 162 163 n_individuals, n_loci, n_bits = genomes.shape 164 bits_per_locus = n_bits // ploidy 165 166 # TODO report when conversion cannot be executed 167 168 unstaggered = np.empty( 169 shape=(n_individuals, ploidy, n_loci, bits_per_locus), 170 dtype=np.bool_, 171 ) 172 173 if ploidy == 1: 174 unstaggered[:, 0] = genomes 175 if ploidy == 2: 176 # Chromosome 0 contains odd bits from input genomes 177 unstaggered[:, 0] = genomes[:, :, 0::2] 178 # Chromosome 1 contains even bits from input genomes 179 unstaggered[:, 1] = genomes[:, :, 1::2] 180 181 return unstaggered 182 183 def get_genomes_sample(self): 184 """Returns a random sample of genomes""" 185 if self.ploidy > 1: 186 # The chromosomes get aligned 187 # 3D (individuals, loci, bits): [1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4] -> 188 # 189 # 2D (everything, 2): [[1, 1, 1, 1, 3, 3, 3, 3], 190 # [2, 2, 2, 2, 4, 4, 4, 4]] -> 191 # 192 # 3D (chromosomes, loci, bits // 2): [[1, 1, 1, 1], 193 # (individuals * 2, ...) [2, 2, 2, 2], 194 # [3, 3, 3, 3], 195 # [4, 4, 4, 4]] 196 genomes = ( 197 self.genomes.reshape(-1, 2).transpose().reshape(self.genomes.shape[0] * 2, self.genomes.shape[1], -1) 198 ) 199 else: 200 genomes = self.genomes 201 202 # TODO check if change in ploidy has implications for popgen stats 203 204 # Check if there are enough genomes to sample 205 if len(genomes) < 2: # NOTE tajimas_d requires minimum 2 206 return None 207 208 # Sample genomes 209 if 0 < parametermanager.parameters.POPGENSTATS_SAMPLE_SIZE <= genomes.shape[0]: 210 indices = np.random.choice( 211 range(genomes.shape[0]), 212 parametermanager.parameters.POPGENSTATS_SAMPLE_SIZE, 213 replace=False, 214 ) 215 return genomes[indices] 216 else: 217 return genomes 218 219 #################################### 220 # OUTPUT: Statistics on population # 221 #################################### 222 223 def get_n(self): 224 """Returns the census population size N""" 225 return self.pop_size_history[-1] 226 227 def get_ne(self): 228 """Returns the effective population size Ne""" 229 return statistics.harmonic_mean(self.pop_size_history) 230 231 def get_allele_frequencies(self): 232 """Returns the frequency of the 1-allele at every position of the genome""" 233 # Aligns the genomes of the population, disregarding chromosomes, and takes the mean 234 return self.genomes.reshape(self.genomes.shape[0], -1).mean(0) 235 236 def get_genotype_frequencies(self): 237 """Output: [loc1_bit1_freq00, loc1_bit1_freq01, loc1_bit1_freq11, loc1_bit2_freq00, ...]""" 238 if self.ploidy == 1: 239 return self.allele_frequencies 240 241 len_pop = self.genomes.shape[0] 242 243 # Genotype = Sum of alleles at a position -> [0, 1, 2] 244 genotypes_raw = self.genomes.reshape(-1, 2).sum(1).reshape(len_pop, -1).transpose() 245 246 # Counts the frequencies of 0, 1 and 2 across the population 247 genotype_freqs = np.array([np.bincount(x, minlength=3) for x in genotypes_raw]).reshape(-1) / len_pop 248 249 return genotype_freqs 250 251 def get_mean_h_per_bit(self): 252 """Returns the mean heterozygosity per bit. 253 Output: [Hloc1_bit1, Hloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci""" 254 if self.ploidy == 1: 255 return None 256 257 return self.genotype_frequencies[1::3] 258 259 def get_mean_h_per_locus(self): 260 """Returns the mean heterozygosity per locus. 261 Output: [Hloc1, Hloc2, ...] Entries: nloci""" 262 if self.ploidy == 1: 263 return None 264 265 h_per_bit = self.mean_h_per_bit 266 return h_per_bit.reshape(-1, self.genomes.shape[2] // 2).mean(1) 267 268 def get_mean_h(self): 269 """Returns the mean heterozygosity of a population. 270 Output: H""" 271 if self.ploidy == 1: 272 return None 273 274 return self.mean_h_per_bit.mean() 275 276 def get_mean_h_per_bit_expected(self): 277 """Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium. 278 Output: [Heloc1_bit1, Heloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci""" 279 if self.ploidy == 1: 280 return None 281 282 genotype_freqs_sqrd = self.genotype_frequencies**2 283 sum_each_locus = genotype_freqs_sqrd.reshape(-1, 3).sum(1) 284 return 1 - sum_each_locus 285 286 def get_mean_h_expected(self): 287 """Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium. 288 Output: He""" 289 if self.ploidy == 1: 290 return None 291 292 return self.mean_h_per_bit_expected.mean() 293 294 def get_mu(self, mutation_rates): 295 """Return the mutation rate µ per gene per generation -> AEGIS-'Locus' interpreted as a gene""" 296 return np.mean(mutation_rates) 297 298 def get_theta(self): 299 """Returns the adjusted mutation rate theta = 4 * Ne * µ, 300 where µ is the mutation rate per gene per generation and Ne is the effective population size""" 301 return (self.ploidy * 2) * self.ne * self.mu 302 303 ############################################## 304 # OUTPUT: Statistics on population or sample # 305 ############################################## 306 307 def get_reference_genome(self, genomes): 308 """Returns the reference genome based on which allele is most common at each position. 309 Equal fractions -> 0""" 310 return np.round(genomes.reshape(genomes.shape[0], -1).mean(0)).astype("int32") 311 312 def get_segregating_sites(self, genomes, ploidy): 313 """Returns the number of segregating sites 314 315 Genomes are first aligned and summed at each site across the population. 316 A site is segregating if its sum is not equal to either 0 or the population size N. 317 """ 318 319 if ploidy == 1: 320 pre_segr_sites = genomes.reshape(genomes.shape[0], -1).sum(0) 321 segr_sites = ( 322 genomes.shape[1] * genomes.shape[2] 323 - (pre_segr_sites == genomes.shape[0]).sum() 324 - (pre_segr_sites == 0).sum() 325 ) 326 327 return segr_sites 328 329 pre_segr_sites = genomes.reshape(-1, 2).transpose().reshape(genomes.shape[0] * 2, -1).sum(0) 330 segr_sites = ( 331 ((genomes.shape[1] * genomes.shape[2]) // 2) 332 - (pre_segr_sites == genomes.shape[0] * 2).sum() 333 - (pre_segr_sites == 0).sum() 334 ) 335 return segr_sites 336 337 ################################ 338 # OUTPUT: Statistics on sample # 339 ################################ 340 341 def get_theta_w(self): 342 """Returns Watterson's estimator theta_w""" 343 return self.segregating_sites_gsample / self.harmonic(self.nsample - 1) 344 345 def get_theta_pi(self): 346 """Returns the estimator theta_pi (based on pairwise differences)""" 347 combs = itertools.combinations(range(self.nsample), 2) 348 349 # Aligns chromosomes and count pairwise differences 350 genomes_sample_flat = self.gsample.reshape(self.nsample, -1) 351 diffs = np.array([(genomes_sample_flat[i[0]] != genomes_sample_flat[i[1]]).sum() for i in combs]) 352 total_diffs = diffs.sum() 353 ncomparisons = diffs.size 354 355 return total_diffs / ncomparisons 356 357 def get_tajimas_d(self): 358 """Returns Tajima's D""" 359 360 if self.nsample < 3: 361 return None 362 363 pre_d = self.theta_pi - self.theta_w 364 segr_sites = self.segregating_sites_gsample 365 366 if segr_sites == 0: 367 logging.info("Cannot compute Tajima's D because there are no segregating sites.") 368 return 369 370 a_1 = self.harmonic(self.nsample - 1) 371 a_2 = self.harmonic_sq(self.nsample - 1) 372 b_1 = (self.nsample + 1) / (3 * (self.nsample - 1)) 373 b_2 = (2 * (self.nsample**2 + self.nsample + 3)) / (9 * self.nsample * (self.nsample - 1)) 374 c_1 = b_1 - (1 / a_1) 375 c_2 = b_2 - ((self.nsample + 2) / (a_1 * self.nsample)) + (a_2 / (a_1**2)) 376 e_1 = c_1 / a_1 377 e_2 = c_2 / ((a_1**2) + a_2) 378 d_stdev = ((e_1 * segr_sites) + (e_2 * segr_sites * (segr_sites - 1))) ** 0.5 379 380 return pre_d / d_stdev 381 382 def get_sfs(self, reference_genome): 383 """Returns the site frequency spectrum (allele frequency spectrum) of a sample""" 384 pre_sfs = self.gsample.reshape(self.nsample, -1).sum(0) 385 pre_sfs[np.nonzero(reference_genome)] -= self.nsample 386 pre_sfs = np.abs(pre_sfs) 387 sfs = np.bincount(pre_sfs, minlength=self.nsample + 1)[:-1] # TODO what if len(genomes) < sample_size 388 return sfs 389 390 def get_theta_h(self): 391 """Returns Fay and Wu's estimator theta_h""" 392 # sum from i=1 to i=n-1: ( (2 * S_i * i^2) / (n * (n-1)) ) 393 sfs = self.sfs 394 t_h = ((2 * sfs * (np.arange(self.nsample) ** 2)) / (self.nsample * (self.nsample - 1))).sum() 395 return t_h 396 397 def get_fayandwu_h(self): 398 """Returns Fay and Wu's H""" 399 pre_h = self.theta_pi - self.theta_h 400 h_stdev = 1 # TODO: Calculate actual variance of h 401 return pre_h / h_stdev
15 def record_pop_size_history(self, genomes): 16 """Records population sizes at last 1000 steps""" 17 if len(self.pop_size_history) >= 1000: 18 del self.pop_size_history[0] 19 self.pop_size_history.append(len(genomes))
Records population sizes at last 1000 steps
21 def calc(self, input_genomes, mutation_rates): 22 """Calculates all popgen metrics 23 24 Set sample_size value to 0 or -1 to not perform any sampling. 25 """ 26 27 # Infer ploidy from genomes 28 self.ploidy = input_genomes.shape[1] 29 30 # Data to process 31 self.genomes = self.make_3D(input_genomes) 32 self.gsample = self.get_genomes_sample() 33 self.nsample = 0 if self.gsample is None else len(self.gsample) # TODO correct for ploidy? 34 35 # Statistics on population 36 self.n = self.get_n() 37 self.ne = self.get_ne() 38 self.allele_frequencies = self.get_allele_frequencies() 39 self.genotype_frequencies = self.get_genotype_frequencies() 40 self.mean_h_per_bit = self.get_mean_h_per_bit() 41 self.mean_h_per_locus = self.get_mean_h_per_locus() 42 self.mean_h = self.get_mean_h() 43 self.mean_h_per_bit_expected = self.get_mean_h_per_bit_expected() 44 self.mean_h_expected = self.get_mean_h_expected() 45 self.mu = self.get_mu(mutation_rates) 46 self.theta = self.get_theta() 47 self.segregating_sites = self.get_segregating_sites(self.genomes, self.ploidy) 48 self.reference_genome = self.get_reference_genome(self.genomes) 49 50 # Statistics on sample 51 if self.nsample: 52 # TODO gsample is essentially haploid, check if that causes issues 53 self.reference_genome_gsample = self.get_reference_genome(self.gsample) 54 self.segregating_sites_gsample = self.get_segregating_sites(self.gsample, 1) 55 self.theta_w = self.get_theta_w() 56 self.theta_pi = self.get_theta_pi() 57 self.tajimas_d = self.get_tajimas_d() 58 self.sfs = self.get_sfs(self.reference_genome_gsample) # Uses reference genome calculated from sample 59 self.theta_h = self.get_theta_h() 60 self.fayandwu_h = self.get_fayandwu_h() 61 else: 62 # TODO will cause Recorder to fail? 63 ( 64 self.reference_genome_gsample, 65 self.segregating_sites_gsample, 66 self.theta_w, 67 self.theta_pi, 68 self.tajimas_d, 69 self.sfs, 70 self.theta_h, 71 self.fayandwu_h, 72 ) = [None] * 8
Calculates all popgen metrics
Set sample_size value to 0 or -1 to not perform any sampling.
74 def emit_simple(self): 75 # TODO add headers 76 # TODO add headers and indices to all output files; and index and column names 77 78 attrs = [ 79 "n", 80 "ne", 81 "mu", 82 "segregating_sites", 83 "segregating_sites_gsample", 84 "theta", 85 "theta_w", 86 "theta_pi", 87 "tajimas_d", 88 "theta_h", 89 "fayandwu_h", 90 ] 91 92 if self.ploidy == 2: 93 attrs += ["mean_h", "mean_h_expected"] 94 95 return {attr: getattr(self, attr) for attr in attrs}
97 def emit_complex(self): 98 attrs = [ 99 "allele_frequencies", 100 "genotype_frequencies", 101 "sfs", 102 "reference_genome", 103 "reference_genome_gsample", 104 ] 105 106 if self.ploidy == 2: 107 attrs += ["mean_h_per_bit", "mean_h_per_locus", "mean_h_per_bit_expected"] 108 109 return {attr: getattr(self, attr) for attr in attrs}
115 @staticmethod 116 def harmonic(i): 117 """Returns the i-th harmonic number""" 118 return (1 / np.arange(1, i + 1)).sum()
Returns the i-th harmonic number
120 @staticmethod 121 def harmonic_sq(i): 122 """Returns the i-th harmonic square""" 123 return (1 / np.arange(1, i + 1) ** 2).sum()
Returns the i-th harmonic square
125 @staticmethod 126 def make_3D(input_genomes): 127 """Returns genomes array with merged chromosomes 128 129 Methods of PopgenStats require the genomes to be in the form [individual, locus, bit] where, if individuals are diploid, 130 the odd bits belong to a virtual first chromosome, while the even bits belong to a virtual second chromosome. 131 If individuals are haploid, all bits belong to one virtual chromosome. 132 133 This is contrast with the genomes arrays in the rest of AEGIS which is in form [individual, chromosome, locus, bit] 134 so that all bits from one chromosome are in a separate array than those in the second chromosome. 135 If individuals are haploid, the chromosome dimension contains only one element. 136 """ 137 138 n_individuals, ploidy, n_loci, bits_per_locus = input_genomes.shape 139 140 # TODO report when conversion cannot be executed 141 142 if ploidy == 1: 143 return input_genomes[:, 0] 144 145 else: 146 genomes = np.empty( 147 shape=(n_individuals, n_loci, ploidy * bits_per_locus), 148 dtype=np.bool_, 149 ) 150 151 # Odd bits contain bits from chromosome 0 152 genomes[:, :, 0::2] = input_genomes[:, 0] 153 154 # Even bits contain bits from chromosome 1 155 genomes[:, :, 1::2] = input_genomes[:, 1] 156 157 return genomes
Returns genomes array with merged chromosomes
Methods of PopgenStats require the genomes to be in the form [individual, locus, bit] where, if individuals are diploid, the odd bits belong to a virtual first chromosome, while the even bits belong to a virtual second chromosome. If individuals are haploid, all bits belong to one virtual chromosome.
This is contrast with the genomes arrays in the rest of AEGIS which is in form [individual, chromosome, locus, bit] so that all bits from one chromosome are in a separate array than those in the second chromosome. If individuals are haploid, the chromosome dimension contains only one element.
159 @staticmethod 160 def make_4D(genomes, ploidy): 161 """Returns genomes array with chromosomes split along the second dimension""" 162 163 n_individuals, n_loci, n_bits = genomes.shape 164 bits_per_locus = n_bits // ploidy 165 166 # TODO report when conversion cannot be executed 167 168 unstaggered = np.empty( 169 shape=(n_individuals, ploidy, n_loci, bits_per_locus), 170 dtype=np.bool_, 171 ) 172 173 if ploidy == 1: 174 unstaggered[:, 0] = genomes 175 if ploidy == 2: 176 # Chromosome 0 contains odd bits from input genomes 177 unstaggered[:, 0] = genomes[:, :, 0::2] 178 # Chromosome 1 contains even bits from input genomes 179 unstaggered[:, 1] = genomes[:, :, 1::2] 180 181 return unstaggered
Returns genomes array with chromosomes split along the second dimension
183 def get_genomes_sample(self): 184 """Returns a random sample of genomes""" 185 if self.ploidy > 1: 186 # The chromosomes get aligned 187 # 3D (individuals, loci, bits): [1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4] -> 188 # 189 # 2D (everything, 2): [[1, 1, 1, 1, 3, 3, 3, 3], 190 # [2, 2, 2, 2, 4, 4, 4, 4]] -> 191 # 192 # 3D (chromosomes, loci, bits // 2): [[1, 1, 1, 1], 193 # (individuals * 2, ...) [2, 2, 2, 2], 194 # [3, 3, 3, 3], 195 # [4, 4, 4, 4]] 196 genomes = ( 197 self.genomes.reshape(-1, 2).transpose().reshape(self.genomes.shape[0] * 2, self.genomes.shape[1], -1) 198 ) 199 else: 200 genomes = self.genomes 201 202 # TODO check if change in ploidy has implications for popgen stats 203 204 # Check if there are enough genomes to sample 205 if len(genomes) < 2: # NOTE tajimas_d requires minimum 2 206 return None 207 208 # Sample genomes 209 if 0 < parametermanager.parameters.POPGENSTATS_SAMPLE_SIZE <= genomes.shape[0]: 210 indices = np.random.choice( 211 range(genomes.shape[0]), 212 parametermanager.parameters.POPGENSTATS_SAMPLE_SIZE, 213 replace=False, 214 ) 215 return genomes[indices] 216 else: 217 return genomes
Returns a random sample of genomes
223 def get_n(self): 224 """Returns the census population size N""" 225 return self.pop_size_history[-1]
Returns the census population size N
227 def get_ne(self): 228 """Returns the effective population size Ne""" 229 return statistics.harmonic_mean(self.pop_size_history)
Returns the effective population size Ne
231 def get_allele_frequencies(self): 232 """Returns the frequency of the 1-allele at every position of the genome""" 233 # Aligns the genomes of the population, disregarding chromosomes, and takes the mean 234 return self.genomes.reshape(self.genomes.shape[0], -1).mean(0)
Returns the frequency of the 1-allele at every position of the genome
236 def get_genotype_frequencies(self): 237 """Output: [loc1_bit1_freq00, loc1_bit1_freq01, loc1_bit1_freq11, loc1_bit2_freq00, ...]""" 238 if self.ploidy == 1: 239 return self.allele_frequencies 240 241 len_pop = self.genomes.shape[0] 242 243 # Genotype = Sum of alleles at a position -> [0, 1, 2] 244 genotypes_raw = self.genomes.reshape(-1, 2).sum(1).reshape(len_pop, -1).transpose() 245 246 # Counts the frequencies of 0, 1 and 2 across the population 247 genotype_freqs = np.array([np.bincount(x, minlength=3) for x in genotypes_raw]).reshape(-1) / len_pop 248 249 return genotype_freqs
Output: [loc1_bit1_freq00, loc1_bit1_freq01, loc1_bit1_freq11, loc1_bit2_freq00, ...]
251 def get_mean_h_per_bit(self): 252 """Returns the mean heterozygosity per bit. 253 Output: [Hloc1_bit1, Hloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci""" 254 if self.ploidy == 1: 255 return None 256 257 return self.genotype_frequencies[1::3]
Returns the mean heterozygosity per bit. Output: [Hloc1_bit1, Hloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci
259 def get_mean_h_per_locus(self): 260 """Returns the mean heterozygosity per locus. 261 Output: [Hloc1, Hloc2, ...] Entries: nloci""" 262 if self.ploidy == 1: 263 return None 264 265 h_per_bit = self.mean_h_per_bit 266 return h_per_bit.reshape(-1, self.genomes.shape[2] // 2).mean(1)
Returns the mean heterozygosity per locus. Output: [Hloc1, Hloc2, ...] Entries: nloci
268 def get_mean_h(self): 269 """Returns the mean heterozygosity of a population. 270 Output: H""" 271 if self.ploidy == 1: 272 return None 273 274 return self.mean_h_per_bit.mean()
Returns the mean heterozygosity of a population. Output: H
276 def get_mean_h_per_bit_expected(self): 277 """Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium. 278 Output: [Heloc1_bit1, Heloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci""" 279 if self.ploidy == 1: 280 return None 281 282 genotype_freqs_sqrd = self.genotype_frequencies**2 283 sum_each_locus = genotype_freqs_sqrd.reshape(-1, 3).sum(1) 284 return 1 - sum_each_locus
Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium. Output: [Heloc1_bit1, Heloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci
286 def get_mean_h_expected(self): 287 """Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium. 288 Output: He""" 289 if self.ploidy == 1: 290 return None 291 292 return self.mean_h_per_bit_expected.mean()
Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium. Output: He
294 def get_mu(self, mutation_rates): 295 """Return the mutation rate µ per gene per generation -> AEGIS-'Locus' interpreted as a gene""" 296 return np.mean(mutation_rates)
Return the mutation rate µ per gene per generation -> AEGIS-'Locus' interpreted as a gene
298 def get_theta(self): 299 """Returns the adjusted mutation rate theta = 4 * Ne * µ, 300 where µ is the mutation rate per gene per generation and Ne is the effective population size""" 301 return (self.ploidy * 2) * self.ne * self.mu
Returns the adjusted mutation rate theta = 4 * Ne * µ, where µ is the mutation rate per gene per generation and Ne is the effective population size
307 def get_reference_genome(self, genomes): 308 """Returns the reference genome based on which allele is most common at each position. 309 Equal fractions -> 0""" 310 return np.round(genomes.reshape(genomes.shape[0], -1).mean(0)).astype("int32")
Returns the reference genome based on which allele is most common at each position. Equal fractions -> 0
312 def get_segregating_sites(self, genomes, ploidy): 313 """Returns the number of segregating sites 314 315 Genomes are first aligned and summed at each site across the population. 316 A site is segregating if its sum is not equal to either 0 or the population size N. 317 """ 318 319 if ploidy == 1: 320 pre_segr_sites = genomes.reshape(genomes.shape[0], -1).sum(0) 321 segr_sites = ( 322 genomes.shape[1] * genomes.shape[2] 323 - (pre_segr_sites == genomes.shape[0]).sum() 324 - (pre_segr_sites == 0).sum() 325 ) 326 327 return segr_sites 328 329 pre_segr_sites = genomes.reshape(-1, 2).transpose().reshape(genomes.shape[0] * 2, -1).sum(0) 330 segr_sites = ( 331 ((genomes.shape[1] * genomes.shape[2]) // 2) 332 - (pre_segr_sites == genomes.shape[0] * 2).sum() 333 - (pre_segr_sites == 0).sum() 334 ) 335 return segr_sites
Returns the number of segregating sites
Genomes are first aligned and summed at each site across the population. A site is segregating if its sum is not equal to either 0 or the population size N.
341 def get_theta_w(self): 342 """Returns Watterson's estimator theta_w""" 343 return self.segregating_sites_gsample / self.harmonic(self.nsample - 1)
Returns Watterson's estimator theta_w
345 def get_theta_pi(self): 346 """Returns the estimator theta_pi (based on pairwise differences)""" 347 combs = itertools.combinations(range(self.nsample), 2) 348 349 # Aligns chromosomes and count pairwise differences 350 genomes_sample_flat = self.gsample.reshape(self.nsample, -1) 351 diffs = np.array([(genomes_sample_flat[i[0]] != genomes_sample_flat[i[1]]).sum() for i in combs]) 352 total_diffs = diffs.sum() 353 ncomparisons = diffs.size 354 355 return total_diffs / ncomparisons
Returns the estimator theta_pi (based on pairwise differences)
357 def get_tajimas_d(self): 358 """Returns Tajima's D""" 359 360 if self.nsample < 3: 361 return None 362 363 pre_d = self.theta_pi - self.theta_w 364 segr_sites = self.segregating_sites_gsample 365 366 if segr_sites == 0: 367 logging.info("Cannot compute Tajima's D because there are no segregating sites.") 368 return 369 370 a_1 = self.harmonic(self.nsample - 1) 371 a_2 = self.harmonic_sq(self.nsample - 1) 372 b_1 = (self.nsample + 1) / (3 * (self.nsample - 1)) 373 b_2 = (2 * (self.nsample**2 + self.nsample + 3)) / (9 * self.nsample * (self.nsample - 1)) 374 c_1 = b_1 - (1 / a_1) 375 c_2 = b_2 - ((self.nsample + 2) / (a_1 * self.nsample)) + (a_2 / (a_1**2)) 376 e_1 = c_1 / a_1 377 e_2 = c_2 / ((a_1**2) + a_2) 378 d_stdev = ((e_1 * segr_sites) + (e_2 * segr_sites * (segr_sites - 1))) ** 0.5 379 380 return pre_d / d_stdev
Returns Tajima's D
382 def get_sfs(self, reference_genome): 383 """Returns the site frequency spectrum (allele frequency spectrum) of a sample""" 384 pre_sfs = self.gsample.reshape(self.nsample, -1).sum(0) 385 pre_sfs[np.nonzero(reference_genome)] -= self.nsample 386 pre_sfs = np.abs(pre_sfs) 387 sfs = np.bincount(pre_sfs, minlength=self.nsample + 1)[:-1] # TODO what if len(genomes) < sample_size 388 return sfs
Returns the site frequency spectrum (allele frequency spectrum) of a sample
390 def get_theta_h(self): 391 """Returns Fay and Wu's estimator theta_h""" 392 # sum from i=1 to i=n-1: ( (2 * S_i * i^2) / (n * (n-1)) ) 393 sfs = self.sfs 394 t_h = ((2 * sfs * (np.arange(self.nsample) ** 2)) / (self.nsample * (self.nsample - 1))).sum() 395 return t_h
Returns Fay and Wu's estimator theta_h