aegis_sim.utilities.popgenstats API documentation

pop_size_history

def record_pop_size_history(self, genomes): View Source

15    def record_pop_size_history(self, genomes):
16        """Records population sizes at last 1000 steps"""
17        if len(self.pop_size_history) >= 1000:
18            del self.pop_size_history[0]
19        self.pop_size_history.append(len(genomes))

Records population sizes at last 1000 steps

def calc(self, input_genomes, mutation_rates): View Source

21    def calc(self, input_genomes, mutation_rates):
22        """Calculates all popgen metrics
23
24        Set sample_size value to 0 or -1 to not perform any sampling.
25        """
26
27        # Infer ploidy from genomes
28        self.ploidy = input_genomes.shape[1]
29
30        # Data to process
31        self.genomes = self.make_3D(input_genomes)
32        self.gsample = self.get_genomes_sample()
33        self.nsample = 0 if self.gsample is None else len(self.gsample)  # TODO correct for ploidy?
34
35        # Statistics on population
36        self.n = self.get_n()
37        self.ne = self.get_ne()
38        self.allele_frequencies = self.get_allele_frequencies()
39        self.genotype_frequencies = self.get_genotype_frequencies()
40        self.mean_h_per_bit = self.get_mean_h_per_bit()
41        self.mean_h_per_locus = self.get_mean_h_per_locus()
42        self.mean_h = self.get_mean_h()
43        self.mean_h_per_bit_expected = self.get_mean_h_per_bit_expected()
44        self.mean_h_expected = self.get_mean_h_expected()
45        self.mu = self.get_mu(mutation_rates)
46        self.theta = self.get_theta()
47        self.segregating_sites = self.get_segregating_sites(self.genomes, self.ploidy)
48        self.reference_genome = self.get_reference_genome(self.genomes)
49
50        # Statistics on sample
51        if self.nsample:
52            # TODO gsample is essentially haploid, check if that causes issues
53            self.reference_genome_gsample = self.get_reference_genome(self.gsample)
54            self.segregating_sites_gsample = self.get_segregating_sites(self.gsample, 1)
55            self.theta_w = self.get_theta_w()
56            self.theta_pi = self.get_theta_pi()
57            self.tajimas_d = self.get_tajimas_d()
58            self.sfs = self.get_sfs(self.reference_genome_gsample)  # Uses reference genome calculated from sample
59            self.theta_h = self.get_theta_h()
60            self.fayandwu_h = self.get_fayandwu_h()
61        else:
62            # TODO will cause Recorder to fail?
63            (
64                self.reference_genome_gsample,
65                self.segregating_sites_gsample,
66                self.theta_w,
67                self.theta_pi,
68                self.tajimas_d,
69                self.sfs,
70                self.theta_h,
71                self.fayandwu_h,
72            ) = [None] * 8

Calculates all popgen metrics

Set sample_size value to 0 or -1 to not perform any sampling.

def emit_simple(self): View Source

74    def emit_simple(self):
75        # TODO add headers
76        # TODO add headers and indices to all output files; and index and column names
77
78        attrs = [
79            "n",
80            "ne",
81            "mu",
82            "segregating_sites",
83            "segregating_sites_gsample",
84            "theta",
85            "theta_w",
86            "theta_pi",
87            "tajimas_d",
88            "theta_h",
89            "fayandwu_h",
90        ]
91
92        if self.ploidy == 2:
93            attrs += ["mean_h", "mean_h_expected"]
94
95        return {attr: getattr(self, attr) for attr in attrs}

def emit_complex(self): View Source

 97    def emit_complex(self):
 98        attrs = [
 99            "allele_frequencies",
100            "genotype_frequencies",
101            "sfs",
102            "reference_genome",
103            "reference_genome_gsample",
104        ]
105
106        if self.ploidy == 2:
107            attrs += ["mean_h_per_bit", "mean_h_per_locus", "mean_h_per_bit_expected"]
108
109        return {attr: getattr(self, attr) for attr in attrs}

@staticmethod

def harmonic(i): View Source

115    @staticmethod
116    def harmonic(i):
117        """Returns the i-th harmonic number"""
118        return (1 / np.arange(1, i + 1)).sum()

Returns the i-th harmonic number

@staticmethod

def harmonic_sq(i): View Source

120    @staticmethod
121    def harmonic_sq(i):
122        """Returns the i-th harmonic square"""
123        return (1 / np.arange(1, i + 1) ** 2).sum()

Returns the i-th harmonic square

@staticmethod

def make_3D(input_genomes): View Source

125    @staticmethod
126    def make_3D(input_genomes):
127        """Returns genomes array with merged chromosomes
128
129        Methods of PopgenStats require the genomes to be in the form [individual, locus, bit] where, if individuals are diploid,
130        the odd bits belong to a virtual first chromosome, while the even bits belong to a virtual second chromosome.
131        If individuals are haploid, all bits belong to one virtual chromosome.
132
133        This is contrast with the genomes arrays in the rest of AEGIS which is in form [individual, chromosome, locus, bit]
134        so that all bits from one chromosome are in a separate array than those in the second chromosome.
135        If individuals are haploid, the chromosome dimension contains only one element.
136        """
137
138        n_individuals, ploidy, n_loci, bits_per_locus = input_genomes.shape
139
140        # TODO report when conversion cannot be executed
141
142        if ploidy == 1:
143            return input_genomes[:, 0]
144
145        else:
146            genomes = np.empty(
147                shape=(n_individuals, n_loci, ploidy * bits_per_locus),
148                dtype=np.bool_,
149            )
150
151            # Odd bits contain bits from chromosome 0
152            genomes[:, :, 0::2] = input_genomes[:, 0]
153
154            # Even bits contain bits from chromosome 1
155            genomes[:, :, 1::2] = input_genomes[:, 1]
156
157        return genomes

Returns genomes array with merged chromosomes

Methods of PopgenStats require the genomes to be in the form [individual, locus, bit] where, if individuals are diploid, the odd bits belong to a virtual first chromosome, while the even bits belong to a virtual second chromosome. If individuals are haploid, all bits belong to one virtual chromosome.

This is contrast with the genomes arrays in the rest of AEGIS which is in form [individual, chromosome, locus, bit] so that all bits from one chromosome are in a separate array than those in the second chromosome. If individuals are haploid, the chromosome dimension contains only one element.

@staticmethod

def make_4D(genomes, ploidy): View Source

159    @staticmethod
160    def make_4D(genomes, ploidy):
161        """Returns genomes array with chromosomes split along the second dimension"""
162
163        n_individuals, n_loci, n_bits = genomes.shape
164        bits_per_locus = n_bits // ploidy
165
166        # TODO report when conversion cannot be executed
167
168        unstaggered = np.empty(
169            shape=(n_individuals, ploidy, n_loci, bits_per_locus),
170            dtype=np.bool_,
171        )
172
173        if ploidy == 1:
174            unstaggered[:, 0] = genomes
175        if ploidy == 2:
176            # Chromosome 0 contains odd bits from input genomes
177            unstaggered[:, 0] = genomes[:, :, 0::2]
178            # Chromosome 1 contains even bits from input genomes
179            unstaggered[:, 1] = genomes[:, :, 1::2]
180
181        return unstaggered

Returns genomes array with chromosomes split along the second dimension

def get_genomes_sample(self): View Source

183    def get_genomes_sample(self):
184        """Returns a random sample of genomes"""
185        if self.ploidy > 1:
186            # The chromosomes get aligned
187            # 3D (individuals, loci, bits): [1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4] ->
188            #
189            # 2D (everything, 2): [[1, 1, 1, 1, 3, 3, 3, 3],
190            #                      [2, 2, 2, 2, 4, 4, 4, 4]] ->
191            #
192            # 3D (chromosomes, loci, bits // 2): [[1, 1, 1, 1],
193            #    (individuals * 2, ...)           [2, 2, 2, 2],
194            #                                     [3, 3, 3, 3],
195            #                                     [4, 4, 4, 4]]
196            genomes = (
197                self.genomes.reshape(-1, 2).transpose().reshape(self.genomes.shape[0] * 2, self.genomes.shape[1], -1)
198            )
199        else:
200            genomes = self.genomes
201
202        # TODO check if change in ploidy has implications for popgen stats
203
204        # Check if there are enough genomes to sample
205        if len(genomes) < 2:  # NOTE tajimas_d requires minimum 2
206            return None
207
208        # Sample genomes
209        if 0 < parametermanager.parameters.POPGENSTATS_SAMPLE_SIZE <= genomes.shape[0]:
210            indices = np.random.choice(
211                range(genomes.shape[0]),
212                parametermanager.parameters.POPGENSTATS_SAMPLE_SIZE,
213                replace=False,
214            )
215            return genomes[indices]
216        else:
217            return genomes

Returns a random sample of genomes

def get_n(self): View Source

223    def get_n(self):
224        """Returns the census population size N"""
225        return self.pop_size_history[-1]

Returns the census population size N

def get_ne(self): View Source

227    def get_ne(self):
228        """Returns the effective population size Ne"""
229        return statistics.harmonic_mean(self.pop_size_history)

Returns the effective population size Ne

def get_allele_frequencies(self): View Source

231    def get_allele_frequencies(self):
232        """Returns the frequency of the 1-allele at every position of the genome"""
233        # Aligns the genomes of the population, disregarding chromosomes, and takes the mean
234        return self.genomes.reshape(self.genomes.shape[0], -1).mean(0)

Returns the frequency of the 1-allele at every position of the genome

def get_genotype_frequencies(self): View Source

236    def get_genotype_frequencies(self):
237        """Output: [loc1_bit1_freq00, loc1_bit1_freq01, loc1_bit1_freq11, loc1_bit2_freq00, ...]"""
238        if self.ploidy == 1:
239            return self.allele_frequencies
240
241        len_pop = self.genomes.shape[0]
242
243        # Genotype = Sum of alleles at a position -> [0, 1, 2]
244        genotypes_raw = self.genomes.reshape(-1, 2).sum(1).reshape(len_pop, -1).transpose()
245
246        # Counts the frequencies of 0, 1 and 2 across the population
247        genotype_freqs = np.array([np.bincount(x, minlength=3) for x in genotypes_raw]).reshape(-1) / len_pop
248
249        return genotype_freqs

Output: [loc1_bit1_freq00, loc1_bit1_freq01, loc1_bit1_freq11, loc1_bit2_freq00, ...]

def get_mean_h_per_bit(self): View Source

251    def get_mean_h_per_bit(self):
252        """Returns the mean heterozygosity per bit.
253        Output: [Hloc1_bit1, Hloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci"""
254        if self.ploidy == 1:
255            return None
256
257        return self.genotype_frequencies[1::3]

Returns the mean heterozygosity per bit. Output: [Hloc1_bit1, Hloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci

def get_mean_h_per_locus(self): View Source

259    def get_mean_h_per_locus(self):
260        """Returns the mean heterozygosity per locus.
261        Output: [Hloc1, Hloc2, ...] Entries: nloci"""
262        if self.ploidy == 1:
263            return None
264
265        h_per_bit = self.mean_h_per_bit
266        return h_per_bit.reshape(-1, self.genomes.shape[2] // 2).mean(1)

Returns the mean heterozygosity per locus. Output: [Hloc1, Hloc2, ...] Entries: nloci

def get_mean_h(self): View Source

268    def get_mean_h(self):
269        """Returns the mean heterozygosity of a population.
270        Output: H"""
271        if self.ploidy == 1:
272            return None
273
274        return self.mean_h_per_bit.mean()

Returns the mean heterozygosity of a population. Output: H

def get_mean_h_per_bit_expected(self): View Source

276    def get_mean_h_per_bit_expected(self):
277        """Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium.
278        Output: [Heloc1_bit1, Heloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci"""
279        if self.ploidy == 1:
280            return None
281
282        genotype_freqs_sqrd = self.genotype_frequencies**2
283        sum_each_locus = genotype_freqs_sqrd.reshape(-1, 3).sum(1)
284        return 1 - sum_each_locus

Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium. Output: [Heloc1_bit1, Heloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci

def get_mean_h_expected(self): View Source

286    def get_mean_h_expected(self):
287        """Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium.
288        Output: He"""
289        if self.ploidy == 1:
290            return None
291
292        return self.mean_h_per_bit_expected.mean()

Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium. Output: He

def get_mu(self, mutation_rates): View Source

294    def get_mu(self, mutation_rates):
295        """Return the mutation rate µ per gene per generation -> AEGIS-'Locus' interpreted as a gene"""
296        return np.mean(mutation_rates)

Return the mutation rate µ per gene per generation -> AEGIS-'Locus' interpreted as a gene

def get_theta(self): View Source

298    def get_theta(self):
299        """Returns the adjusted mutation rate theta = 4 * Ne * µ,
300        where µ is the mutation rate per gene per generation and Ne is the effective population size"""
301        return (self.ploidy * 2) * self.ne * self.mu

Returns the adjusted mutation rate theta = 4 * Ne * µ, where µ is the mutation rate per gene per generation and Ne is the effective population size

def get_reference_genome(self, genomes): View Source

307    def get_reference_genome(self, genomes):
308        """Returns the reference genome based on which allele is most common at each position.
309        Equal fractions -> 0"""
310        return np.round(genomes.reshape(genomes.shape[0], -1).mean(0)).astype("int32")

Returns the reference genome based on which allele is most common at each position. Equal fractions -> 0

def get_segregating_sites(self, genomes, ploidy): View Source

312    def get_segregating_sites(self, genomes, ploidy):
313        """Returns the number of segregating sites
314
315        Genomes are first aligned and summed at each site across the population.
316        A site is segregating if its sum is not equal to either 0 or the population size N.
317        """
318
319        if ploidy == 1:
320            pre_segr_sites = genomes.reshape(genomes.shape[0], -1).sum(0)
321            segr_sites = (
322                genomes.shape[1] * genomes.shape[2]
323                - (pre_segr_sites == genomes.shape[0]).sum()
324                - (pre_segr_sites == 0).sum()
325            )
326
327            return segr_sites
328
329        pre_segr_sites = genomes.reshape(-1, 2).transpose().reshape(genomes.shape[0] * 2, -1).sum(0)
330        segr_sites = (
331            ((genomes.shape[1] * genomes.shape[2]) // 2)
332            - (pre_segr_sites == genomes.shape[0] * 2).sum()
333            - (pre_segr_sites == 0).sum()
334        )
335        return segr_sites

Returns the number of segregating sites

Genomes are first aligned and summed at each site across the population. A site is segregating if its sum is not equal to either 0 or the population size N.

def get_theta_w(self): View Source

341    def get_theta_w(self):
342        """Returns Watterson's estimator theta_w"""
343        return self.segregating_sites_gsample / self.harmonic(self.nsample - 1)

Returns Watterson's estimator theta_w

def get_theta_pi(self): View Source

345    def get_theta_pi(self):
346        """Returns the estimator theta_pi (based on pairwise differences)"""
347        combs = itertools.combinations(range(self.nsample), 2)
348
349        # Aligns chromosomes and count pairwise differences
350        genomes_sample_flat = self.gsample.reshape(self.nsample, -1)
351        diffs = np.array([(genomes_sample_flat[i[0]] != genomes_sample_flat[i[1]]).sum() for i in combs])
352        total_diffs = diffs.sum()
353        ncomparisons = diffs.size
354
355        return total_diffs / ncomparisons

Returns the estimator theta_pi (based on pairwise differences)

def get_tajimas_d(self): View Source

357    def get_tajimas_d(self):
358        """Returns Tajima's D"""
359
360        if self.nsample < 3:
361            return None
362
363        pre_d = self.theta_pi - self.theta_w
364        segr_sites = self.segregating_sites_gsample
365
366        if segr_sites == 0:
367            logging.info("Cannot compute Tajima's D because there are no segregating sites.")
368            return
369
370        a_1 = self.harmonic(self.nsample - 1)
371        a_2 = self.harmonic_sq(self.nsample - 1)
372        b_1 = (self.nsample + 1) / (3 * (self.nsample - 1))
373        b_2 = (2 * (self.nsample**2 + self.nsample + 3)) / (9 * self.nsample * (self.nsample - 1))
374        c_1 = b_1 - (1 / a_1)
375        c_2 = b_2 - ((self.nsample + 2) / (a_1 * self.nsample)) + (a_2 / (a_1**2))
376        e_1 = c_1 / a_1
377        e_2 = c_2 / ((a_1**2) + a_2)
378        d_stdev = ((e_1 * segr_sites) + (e_2 * segr_sites * (segr_sites - 1))) ** 0.5
379
380        return pre_d / d_stdev

Returns Tajima's D

def get_sfs(self, reference_genome): View Source

382    def get_sfs(self, reference_genome):
383        """Returns the site frequency spectrum (allele frequency spectrum) of a sample"""
384        pre_sfs = self.gsample.reshape(self.nsample, -1).sum(0)
385        pre_sfs[np.nonzero(reference_genome)] -= self.nsample
386        pre_sfs = np.abs(pre_sfs)
387        sfs = np.bincount(pre_sfs, minlength=self.nsample + 1)[:-1]  # TODO what if len(genomes) < sample_size
388        return sfs

Returns the site frequency spectrum (allele frequency spectrum) of a sample

def get_theta_h(self): View Source

390    def get_theta_h(self):
391        """Returns Fay and Wu's estimator theta_h"""
392        # sum from i=1 to i=n-1: ( (2 * S_i * i^2) / (n * (n-1)) )
393        sfs = self.sfs
394        t_h = ((2 * sfs * (np.arange(self.nsample) ** 2)) / (self.nsample * (self.nsample - 1))).sum()
395        return t_h

Returns Fay and Wu's estimator theta_h

def get_fayandwu_h(self): View Source

397    def get_fayandwu_h(self):
398        """Returns Fay and Wu's H"""
399        pre_h = self.theta_pi - self.theta_h
400        h_stdev = 1  # TODO: Calculate actual variance of h
401        return pre_h / h_stdev

Returns Fay and Wu's H