aegis_sim.utilities.popgenstats

Contains functions for the computation of relevant population genetic metrics

  1"""Contains functions for the computation of relevant population genetic metrics"""
  2
  3import statistics
  4import itertools
  5import logging
  6import numpy as np
  7
  8from aegis_sim.parameterization import parametermanager
  9
 10class PopgenStats:
 11    def __init__(self):
 12        self.pop_size_history = []
 13
 14    def record_pop_size_history(self, genomes):
 15        """Records population sizes at last 1000 steps"""
 16        if len(self.pop_size_history) >= 1000:
 17            del self.pop_size_history[0]
 18        self.pop_size_history.append(len(genomes))
 19
 20    def calc(self, input_genomes, mutation_rates):
 21        """Calculates all popgen metrics
 22
 23        Set sample_size value to 0 or -1 to not perform any sampling.
 24        """
 25
 26        # Infer ploidy from genomes
 27        self.ploidy = input_genomes.shape[1]
 28
 29        # Data to process
 30        self.genomes = self.make_3D(input_genomes)
 31        self.gsample = self.get_genomes_sample()
 32        self.nsample = 0 if self.gsample is None else len(self.gsample)  # TODO correct for ploidy?
 33
 34        # Statistics on population
 35        self.n = self.get_n()
 36        self.ne = self.get_ne()
 37        self.allele_frequencies = self.get_allele_frequencies()
 38        self.genotype_frequencies = self.get_genotype_frequencies()
 39        self.mean_h_per_bit = self.get_mean_h_per_bit()
 40        self.mean_h_per_locus = self.get_mean_h_per_locus()
 41        self.mean_h = self.get_mean_h()
 42        self.mean_h_per_bit_expected = self.get_mean_h_per_bit_expected()
 43        self.mean_h_expected = self.get_mean_h_expected()
 44        self.mu = self.get_mu(mutation_rates)
 45        self.theta = self.get_theta()
 46        self.segregating_sites = self.get_segregating_sites(self.genomes, self.ploidy)
 47        self.reference_genome = self.get_reference_genome(self.genomes)
 48
 49        # Statistics on sample
 50        if self.nsample:
 51            # TODO gsample is essentially haploid, check if that causes issues
 52            self.reference_genome_gsample = self.get_reference_genome(self.gsample)
 53            self.segregating_sites_gsample = self.get_segregating_sites(self.gsample, 1)
 54            self.theta_w = self.get_theta_w()
 55            self.theta_pi = self.get_theta_pi()
 56            self.tajimas_d = self.get_tajimas_d()
 57            self.sfs = self.get_sfs(self.reference_genome_gsample)  # Uses reference genome calculated from sample
 58            self.theta_h = self.get_theta_h()
 59            self.fayandwu_h = self.get_fayandwu_h()
 60        else:
 61            # TODO will cause Recorder to fail?
 62            (
 63                self.reference_genome_gsample,
 64                self.segregating_sites_gsample,
 65                self.theta_w,
 66                self.theta_pi,
 67                self.tajimas_d,
 68                self.sfs,
 69                self.theta_h,
 70                self.fayandwu_h,
 71            ) = [None] * 8
 72
 73    def emit_simple(self):
 74        # TODO add headers
 75        # TODO add headers and indices to all output files; and index and column names
 76
 77        attrs = [
 78            "n",
 79            "ne",
 80            "mu",
 81            "segregating_sites",
 82            "segregating_sites_gsample",
 83            "theta",
 84            "theta_w",
 85            "theta_pi",
 86            "tajimas_d",
 87            "theta_h",
 88            "fayandwu_h",
 89        ]
 90
 91        if self.ploidy == 2:
 92            attrs += ["mean_h", "mean_h_expected"]
 93
 94        return {attr: getattr(self, attr) for attr in attrs}
 95
 96    def emit_complex(self):
 97        attrs = [
 98            "allele_frequencies",
 99            "genotype_frequencies",
100            "sfs",
101            "reference_genome",
102            "reference_genome_gsample",
103        ]
104
105        if self.ploidy == 2:
106            attrs += ["mean_h_per_bit", "mean_h_per_locus", "mean_h_per_bit_expected"]
107
108        return {attr: getattr(self, attr) for attr in attrs}
109
110    ####################
111    # HELPER FUNCTIONS #
112    ####################
113
114    @staticmethod
115    def harmonic(i):
116        """Returns the i-th harmonic number"""
117        return (1 / np.arange(1, i + 1)).sum()
118
119    @staticmethod
120    def harmonic_sq(i):
121        """Returns the i-th harmonic square"""
122        return (1 / np.arange(1, i + 1) ** 2).sum()
123
124    @staticmethod
125    def make_3D(input_genomes):
126        """Returns genomes array with merged chromosomes
127
128        Methods of PopgenStats require the genomes to be in the form [individual, locus, bit] where, if individuals are diploid,
129        the odd bits belong to a virtual first chromosome, while the even bits belong to a virtual second chromosome.
130        If individuals are haploid, all bits belong to one virtual chromosome.
131
132        This is contrast with the genomes arrays in the rest of AEGIS which is in form [individual, chromosome, locus, bit]
133        so that all bits from one chromosome are in a separate array than those in the second chromosome.
134        If individuals are haploid, the chromosome dimension contains only one element.
135        """
136
137        n_individuals, ploidy, n_loci, bits_per_locus = input_genomes.shape
138
139        # TODO report when conversion cannot be executed
140
141        if ploidy == 1:
142            return input_genomes[:, 0]
143
144        else:
145            genomes = np.empty(
146                shape=(n_individuals, n_loci, ploidy * bits_per_locus),
147                dtype=np.bool_,
148            )
149
150            # Odd bits contain bits from chromosome 0
151            genomes[:, :, 0::2] = input_genomes[:, 0]
152
153            # Even bits contain bits from chromosome 1
154            genomes[:, :, 1::2] = input_genomes[:, 1]
155
156        return genomes
157
158    @staticmethod
159    def make_4D(genomes, ploidy):
160        """Returns genomes array with chromosomes split along the second dimension"""
161
162        n_individuals, n_loci, n_bits = genomes.shape
163        bits_per_locus = n_bits // ploidy
164
165        # TODO report when conversion cannot be executed
166
167        unstaggered = np.empty(
168            shape=(n_individuals, ploidy, n_loci, bits_per_locus),
169            dtype=np.bool_,
170        )
171
172        if ploidy == 1:
173            unstaggered[:, 0] = genomes
174        if ploidy == 2:
175            # Chromosome 0 contains odd bits from input genomes
176            unstaggered[:, 0] = genomes[:, :, 0::2]
177            # Chromosome 1 contains even bits from input genomes
178            unstaggered[:, 1] = genomes[:, :, 1::2]
179
180        return unstaggered
181
182    def get_genomes_sample(self):
183        """Returns a random sample of genomes"""
184        if self.ploidy > 1:
185            # The chromosomes get aligned
186            # 3D (individuals, loci, bits): [1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4] ->
187            #
188            # 2D (everything, 2): [[1, 1, 1, 1, 3, 3, 3, 3],
189            #                      [2, 2, 2, 2, 4, 4, 4, 4]] ->
190            #
191            # 3D (chromosomes, loci, bits // 2): [[1, 1, 1, 1],
192            #    (individuals * 2, ...)           [2, 2, 2, 2],
193            #                                     [3, 3, 3, 3],
194            #                                     [4, 4, 4, 4]]
195            genomes = (
196                self.genomes.reshape(-1, 2).transpose().reshape(self.genomes.shape[0] * 2, self.genomes.shape[1], -1)
197            )
198        else:
199            genomes = self.genomes
200
201        # TODO check if change in ploidy has implications for popgen stats
202
203        # Check if there are enough genomes to sample
204        if len(genomes) < 2:  # NOTE tajimas_d requires minimum 2
205            return None
206
207        # Sample genomes
208        if 0 < parametermanager.parameters.POPGENSTATS_SAMPLE_SIZE <= genomes.shape[0]:
209            indices = np.random.choice(
210                range(genomes.shape[0]),
211                parametermanager.parameters.POPGENSTATS_SAMPLE_SIZE,
212                replace=False,
213            )
214            return genomes[indices]
215        else:
216            return genomes
217
218    ####################################
219    # OUTPUT: Statistics on population #
220    ####################################
221
222    def get_n(self):
223        """Returns the census population size N"""
224        return self.pop_size_history[-1]
225
226    def get_ne(self):
227        """Returns the effective population size Ne"""
228        return statistics.harmonic_mean(self.pop_size_history)
229
230    def get_allele_frequencies(self):
231        """Returns the frequency of the 1-allele at every position of the genome"""
232        # Aligns the genomes of the population, disregarding chromosomes, and takes the mean
233        return self.genomes.reshape(self.genomes.shape[0], -1).mean(0)
234
235    def get_genotype_frequencies(self):
236        """Output: [loc1_bit1_freq00, loc1_bit1_freq01, loc1_bit1_freq11, loc1_bit2_freq00, ...]"""
237        if self.ploidy == 1:
238            return self.allele_frequencies
239
240        len_pop = self.genomes.shape[0]
241
242        # Genotype = Sum of alleles at a position -> [0, 1, 2]
243        genotypes_raw = self.genomes.reshape(-1, 2).sum(1).reshape(len_pop, -1).transpose()
244
245        # Counts the frequencies of 0, 1 and 2 across the population
246        genotype_freqs = np.array([np.bincount(x, minlength=3) for x in genotypes_raw]).reshape(-1) / len_pop
247
248        return genotype_freqs
249
250    def get_mean_h_per_bit(self):
251        """Returns the mean heterozygosity per bit.
252        Output: [Hloc1_bit1, Hloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci"""
253        if self.ploidy == 1:
254            return None
255
256        return self.genotype_frequencies[1::3]
257
258    def get_mean_h_per_locus(self):
259        """Returns the mean heterozygosity per locus.
260        Output: [Hloc1, Hloc2, ...] Entries: nloci"""
261        if self.ploidy == 1:
262            return None
263
264        h_per_bit = self.mean_h_per_bit
265        return h_per_bit.reshape(-1, self.genomes.shape[2] // 2).mean(1)
266
267    def get_mean_h(self):
268        """Returns the mean heterozygosity of a population.
269        Output: H"""
270        if self.ploidy == 1:
271            return None
272
273        return self.mean_h_per_bit.mean()
274
275    def get_mean_h_per_bit_expected(self):
276        """Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium.
277        Output: [Heloc1_bit1, Heloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci"""
278        if self.ploidy == 1:
279            return None
280
281        genotype_freqs_sqrd = self.genotype_frequencies**2
282        sum_each_locus = genotype_freqs_sqrd.reshape(-1, 3).sum(1)
283        return 1 - sum_each_locus
284
285    def get_mean_h_expected(self):
286        """Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium.
287        Output: He"""
288        if self.ploidy == 1:
289            return None
290
291        return self.mean_h_per_bit_expected.mean()
292
293    def get_mu(self, mutation_rates):
294        """Return the mutation rate µ per gene per generation -> AEGIS-'Locus' interpreted as a gene"""
295        return np.mean(mutation_rates)
296
297    def get_theta(self):
298        """Returns the adjusted mutation rate theta = 4 * Ne * µ,
299        where µ is the mutation rate per gene per generation and Ne is the effective population size"""
300        return (self.ploidy * 2) * self.ne * self.mu
301
302    ##############################################
303    # OUTPUT: Statistics on population or sample #
304    ##############################################
305
306    def get_reference_genome(self, genomes):
307        """Returns the reference genome based on which allele is most common at each position.
308        Equal fractions -> 0"""
309        return np.round(genomes.reshape(genomes.shape[0], -1).mean(0)).astype("int32")
310
311    def get_segregating_sites(self, genomes, ploidy):
312        """Returns the number of segregating sites
313
314        Genomes are first aligned and summed at each site across the population.
315        A site is segregating if its sum is not equal to either 0 or the population size N.
316        """
317
318        if ploidy == 1:
319            pre_segr_sites = genomes.reshape(genomes.shape[0], -1).sum(0)
320            segr_sites = (
321                genomes.shape[1] * genomes.shape[2]
322                - (pre_segr_sites == genomes.shape[0]).sum()
323                - (pre_segr_sites == 0).sum()
324            )
325
326            return segr_sites
327
328        pre_segr_sites = genomes.reshape(-1, 2).transpose().reshape(genomes.shape[0] * 2, -1).sum(0)
329        segr_sites = (
330            ((genomes.shape[1] * genomes.shape[2]) // 2)
331            - (pre_segr_sites == genomes.shape[0] * 2).sum()
332            - (pre_segr_sites == 0).sum()
333        )
334        return segr_sites
335
336    ################################
337    # OUTPUT: Statistics on sample #
338    ################################
339
340    def get_theta_w(self):
341        """Returns Watterson's estimator theta_w"""
342        return self.segregating_sites_gsample / self.harmonic(self.nsample - 1)
343
344    def get_theta_pi(self):
345        """Returns the estimator theta_pi (based on pairwise differences)"""
346        combs = itertools.combinations(range(self.nsample), 2)
347
348        # Aligns chromosomes and count pairwise differences
349        genomes_sample_flat = self.gsample.reshape(self.nsample, -1)
350        diffs = np.array([(genomes_sample_flat[i[0]] != genomes_sample_flat[i[1]]).sum() for i in combs])
351        total_diffs = diffs.sum()
352        ncomparisons = diffs.size
353
354        return total_diffs / ncomparisons
355
356    def get_tajimas_d(self):
357        """Returns Tajima's D"""
358
359        if self.nsample < 3:
360            return None
361
362        pre_d = self.theta_pi - self.theta_w
363        segr_sites = self.segregating_sites_gsample
364
365        if segr_sites == 0:
366            logging.info("Cannot compute Tajima's D because there are no segregating sites.")
367            return
368
369        a_1 = self.harmonic(self.nsample - 1)
370        a_2 = self.harmonic_sq(self.nsample - 1)
371        b_1 = (self.nsample + 1) / (3 * (self.nsample - 1))
372        b_2 = (2 * (self.nsample**2 + self.nsample + 3)) / (9 * self.nsample * (self.nsample - 1))
373        c_1 = b_1 - (1 / a_1)
374        c_2 = b_2 - ((self.nsample + 2) / (a_1 * self.nsample)) + (a_2 / (a_1**2))
375        e_1 = c_1 / a_1
376        e_2 = c_2 / ((a_1**2) + a_2)
377        d_stdev = ((e_1 * segr_sites) + (e_2 * segr_sites * (segr_sites - 1))) ** 0.5
378
379        return pre_d / d_stdev
380
381    def get_sfs(self, reference_genome):
382        """Returns the site frequency spectrum (allele frequency spectrum) of a sample"""
383        pre_sfs = self.gsample.reshape(self.nsample, -1).sum(0)
384        pre_sfs[np.nonzero(reference_genome)] -= self.nsample
385        pre_sfs = np.abs(pre_sfs)
386        sfs = np.bincount(pre_sfs, minlength=self.nsample + 1)[:-1]  # TODO what if len(genomes) < sample_size
387        return sfs
388
389    def get_theta_h(self):
390        """Returns Fay and Wu's estimator theta_h"""
391        # sum from i=1 to i=n-1: ( (2 * S_i * i^2) / (n * (n-1)) )
392        sfs = self.sfs
393        t_h = ((2 * sfs * (np.arange(self.nsample) ** 2)) / (self.nsample * (self.nsample - 1))).sum()
394        return t_h
395
396    def get_fayandwu_h(self):
397        """Returns Fay and Wu's H"""
398        pre_h = self.theta_pi - self.theta_h
399        h_stdev = 1  # TODO: Calculate actual variance of h
400        return pre_h / h_stdev
class PopgenStats:
 11class PopgenStats:
 12    def __init__(self):
 13        self.pop_size_history = []
 14
 15    def record_pop_size_history(self, genomes):
 16        """Records population sizes at last 1000 steps"""
 17        if len(self.pop_size_history) >= 1000:
 18            del self.pop_size_history[0]
 19        self.pop_size_history.append(len(genomes))
 20
 21    def calc(self, input_genomes, mutation_rates):
 22        """Calculates all popgen metrics
 23
 24        Set sample_size value to 0 or -1 to not perform any sampling.
 25        """
 26
 27        # Infer ploidy from genomes
 28        self.ploidy = input_genomes.shape[1]
 29
 30        # Data to process
 31        self.genomes = self.make_3D(input_genomes)
 32        self.gsample = self.get_genomes_sample()
 33        self.nsample = 0 if self.gsample is None else len(self.gsample)  # TODO correct for ploidy?
 34
 35        # Statistics on population
 36        self.n = self.get_n()
 37        self.ne = self.get_ne()
 38        self.allele_frequencies = self.get_allele_frequencies()
 39        self.genotype_frequencies = self.get_genotype_frequencies()
 40        self.mean_h_per_bit = self.get_mean_h_per_bit()
 41        self.mean_h_per_locus = self.get_mean_h_per_locus()
 42        self.mean_h = self.get_mean_h()
 43        self.mean_h_per_bit_expected = self.get_mean_h_per_bit_expected()
 44        self.mean_h_expected = self.get_mean_h_expected()
 45        self.mu = self.get_mu(mutation_rates)
 46        self.theta = self.get_theta()
 47        self.segregating_sites = self.get_segregating_sites(self.genomes, self.ploidy)
 48        self.reference_genome = self.get_reference_genome(self.genomes)
 49
 50        # Statistics on sample
 51        if self.nsample:
 52            # TODO gsample is essentially haploid, check if that causes issues
 53            self.reference_genome_gsample = self.get_reference_genome(self.gsample)
 54            self.segregating_sites_gsample = self.get_segregating_sites(self.gsample, 1)
 55            self.theta_w = self.get_theta_w()
 56            self.theta_pi = self.get_theta_pi()
 57            self.tajimas_d = self.get_tajimas_d()
 58            self.sfs = self.get_sfs(self.reference_genome_gsample)  # Uses reference genome calculated from sample
 59            self.theta_h = self.get_theta_h()
 60            self.fayandwu_h = self.get_fayandwu_h()
 61        else:
 62            # TODO will cause Recorder to fail?
 63            (
 64                self.reference_genome_gsample,
 65                self.segregating_sites_gsample,
 66                self.theta_w,
 67                self.theta_pi,
 68                self.tajimas_d,
 69                self.sfs,
 70                self.theta_h,
 71                self.fayandwu_h,
 72            ) = [None] * 8
 73
 74    def emit_simple(self):
 75        # TODO add headers
 76        # TODO add headers and indices to all output files; and index and column names
 77
 78        attrs = [
 79            "n",
 80            "ne",
 81            "mu",
 82            "segregating_sites",
 83            "segregating_sites_gsample",
 84            "theta",
 85            "theta_w",
 86            "theta_pi",
 87            "tajimas_d",
 88            "theta_h",
 89            "fayandwu_h",
 90        ]
 91
 92        if self.ploidy == 2:
 93            attrs += ["mean_h", "mean_h_expected"]
 94
 95        return {attr: getattr(self, attr) for attr in attrs}
 96
 97    def emit_complex(self):
 98        attrs = [
 99            "allele_frequencies",
100            "genotype_frequencies",
101            "sfs",
102            "reference_genome",
103            "reference_genome_gsample",
104        ]
105
106        if self.ploidy == 2:
107            attrs += ["mean_h_per_bit", "mean_h_per_locus", "mean_h_per_bit_expected"]
108
109        return {attr: getattr(self, attr) for attr in attrs}
110
111    ####################
112    # HELPER FUNCTIONS #
113    ####################
114
115    @staticmethod
116    def harmonic(i):
117        """Returns the i-th harmonic number"""
118        return (1 / np.arange(1, i + 1)).sum()
119
120    @staticmethod
121    def harmonic_sq(i):
122        """Returns the i-th harmonic square"""
123        return (1 / np.arange(1, i + 1) ** 2).sum()
124
125    @staticmethod
126    def make_3D(input_genomes):
127        """Returns genomes array with merged chromosomes
128
129        Methods of PopgenStats require the genomes to be in the form [individual, locus, bit] where, if individuals are diploid,
130        the odd bits belong to a virtual first chromosome, while the even bits belong to a virtual second chromosome.
131        If individuals are haploid, all bits belong to one virtual chromosome.
132
133        This is contrast with the genomes arrays in the rest of AEGIS which is in form [individual, chromosome, locus, bit]
134        so that all bits from one chromosome are in a separate array than those in the second chromosome.
135        If individuals are haploid, the chromosome dimension contains only one element.
136        """
137
138        n_individuals, ploidy, n_loci, bits_per_locus = input_genomes.shape
139
140        # TODO report when conversion cannot be executed
141
142        if ploidy == 1:
143            return input_genomes[:, 0]
144
145        else:
146            genomes = np.empty(
147                shape=(n_individuals, n_loci, ploidy * bits_per_locus),
148                dtype=np.bool_,
149            )
150
151            # Odd bits contain bits from chromosome 0
152            genomes[:, :, 0::2] = input_genomes[:, 0]
153
154            # Even bits contain bits from chromosome 1
155            genomes[:, :, 1::2] = input_genomes[:, 1]
156
157        return genomes
158
159    @staticmethod
160    def make_4D(genomes, ploidy):
161        """Returns genomes array with chromosomes split along the second dimension"""
162
163        n_individuals, n_loci, n_bits = genomes.shape
164        bits_per_locus = n_bits // ploidy
165
166        # TODO report when conversion cannot be executed
167
168        unstaggered = np.empty(
169            shape=(n_individuals, ploidy, n_loci, bits_per_locus),
170            dtype=np.bool_,
171        )
172
173        if ploidy == 1:
174            unstaggered[:, 0] = genomes
175        if ploidy == 2:
176            # Chromosome 0 contains odd bits from input genomes
177            unstaggered[:, 0] = genomes[:, :, 0::2]
178            # Chromosome 1 contains even bits from input genomes
179            unstaggered[:, 1] = genomes[:, :, 1::2]
180
181        return unstaggered
182
183    def get_genomes_sample(self):
184        """Returns a random sample of genomes"""
185        if self.ploidy > 1:
186            # The chromosomes get aligned
187            # 3D (individuals, loci, bits): [1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4] ->
188            #
189            # 2D (everything, 2): [[1, 1, 1, 1, 3, 3, 3, 3],
190            #                      [2, 2, 2, 2, 4, 4, 4, 4]] ->
191            #
192            # 3D (chromosomes, loci, bits // 2): [[1, 1, 1, 1],
193            #    (individuals * 2, ...)           [2, 2, 2, 2],
194            #                                     [3, 3, 3, 3],
195            #                                     [4, 4, 4, 4]]
196            genomes = (
197                self.genomes.reshape(-1, 2).transpose().reshape(self.genomes.shape[0] * 2, self.genomes.shape[1], -1)
198            )
199        else:
200            genomes = self.genomes
201
202        # TODO check if change in ploidy has implications for popgen stats
203
204        # Check if there are enough genomes to sample
205        if len(genomes) < 2:  # NOTE tajimas_d requires minimum 2
206            return None
207
208        # Sample genomes
209        if 0 < parametermanager.parameters.POPGENSTATS_SAMPLE_SIZE <= genomes.shape[0]:
210            indices = np.random.choice(
211                range(genomes.shape[0]),
212                parametermanager.parameters.POPGENSTATS_SAMPLE_SIZE,
213                replace=False,
214            )
215            return genomes[indices]
216        else:
217            return genomes
218
219    ####################################
220    # OUTPUT: Statistics on population #
221    ####################################
222
223    def get_n(self):
224        """Returns the census population size N"""
225        return self.pop_size_history[-1]
226
227    def get_ne(self):
228        """Returns the effective population size Ne"""
229        return statistics.harmonic_mean(self.pop_size_history)
230
231    def get_allele_frequencies(self):
232        """Returns the frequency of the 1-allele at every position of the genome"""
233        # Aligns the genomes of the population, disregarding chromosomes, and takes the mean
234        return self.genomes.reshape(self.genomes.shape[0], -1).mean(0)
235
236    def get_genotype_frequencies(self):
237        """Output: [loc1_bit1_freq00, loc1_bit1_freq01, loc1_bit1_freq11, loc1_bit2_freq00, ...]"""
238        if self.ploidy == 1:
239            return self.allele_frequencies
240
241        len_pop = self.genomes.shape[0]
242
243        # Genotype = Sum of alleles at a position -> [0, 1, 2]
244        genotypes_raw = self.genomes.reshape(-1, 2).sum(1).reshape(len_pop, -1).transpose()
245
246        # Counts the frequencies of 0, 1 and 2 across the population
247        genotype_freqs = np.array([np.bincount(x, minlength=3) for x in genotypes_raw]).reshape(-1) / len_pop
248
249        return genotype_freqs
250
251    def get_mean_h_per_bit(self):
252        """Returns the mean heterozygosity per bit.
253        Output: [Hloc1_bit1, Hloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci"""
254        if self.ploidy == 1:
255            return None
256
257        return self.genotype_frequencies[1::3]
258
259    def get_mean_h_per_locus(self):
260        """Returns the mean heterozygosity per locus.
261        Output: [Hloc1, Hloc2, ...] Entries: nloci"""
262        if self.ploidy == 1:
263            return None
264
265        h_per_bit = self.mean_h_per_bit
266        return h_per_bit.reshape(-1, self.genomes.shape[2] // 2).mean(1)
267
268    def get_mean_h(self):
269        """Returns the mean heterozygosity of a population.
270        Output: H"""
271        if self.ploidy == 1:
272            return None
273
274        return self.mean_h_per_bit.mean()
275
276    def get_mean_h_per_bit_expected(self):
277        """Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium.
278        Output: [Heloc1_bit1, Heloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci"""
279        if self.ploidy == 1:
280            return None
281
282        genotype_freqs_sqrd = self.genotype_frequencies**2
283        sum_each_locus = genotype_freqs_sqrd.reshape(-1, 3).sum(1)
284        return 1 - sum_each_locus
285
286    def get_mean_h_expected(self):
287        """Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium.
288        Output: He"""
289        if self.ploidy == 1:
290            return None
291
292        return self.mean_h_per_bit_expected.mean()
293
294    def get_mu(self, mutation_rates):
295        """Return the mutation rate µ per gene per generation -> AEGIS-'Locus' interpreted as a gene"""
296        return np.mean(mutation_rates)
297
298    def get_theta(self):
299        """Returns the adjusted mutation rate theta = 4 * Ne * µ,
300        where µ is the mutation rate per gene per generation and Ne is the effective population size"""
301        return (self.ploidy * 2) * self.ne * self.mu
302
303    ##############################################
304    # OUTPUT: Statistics on population or sample #
305    ##############################################
306
307    def get_reference_genome(self, genomes):
308        """Returns the reference genome based on which allele is most common at each position.
309        Equal fractions -> 0"""
310        return np.round(genomes.reshape(genomes.shape[0], -1).mean(0)).astype("int32")
311
312    def get_segregating_sites(self, genomes, ploidy):
313        """Returns the number of segregating sites
314
315        Genomes are first aligned and summed at each site across the population.
316        A site is segregating if its sum is not equal to either 0 or the population size N.
317        """
318
319        if ploidy == 1:
320            pre_segr_sites = genomes.reshape(genomes.shape[0], -1).sum(0)
321            segr_sites = (
322                genomes.shape[1] * genomes.shape[2]
323                - (pre_segr_sites == genomes.shape[0]).sum()
324                - (pre_segr_sites == 0).sum()
325            )
326
327            return segr_sites
328
329        pre_segr_sites = genomes.reshape(-1, 2).transpose().reshape(genomes.shape[0] * 2, -1).sum(0)
330        segr_sites = (
331            ((genomes.shape[1] * genomes.shape[2]) // 2)
332            - (pre_segr_sites == genomes.shape[0] * 2).sum()
333            - (pre_segr_sites == 0).sum()
334        )
335        return segr_sites
336
337    ################################
338    # OUTPUT: Statistics on sample #
339    ################################
340
341    def get_theta_w(self):
342        """Returns Watterson's estimator theta_w"""
343        return self.segregating_sites_gsample / self.harmonic(self.nsample - 1)
344
345    def get_theta_pi(self):
346        """Returns the estimator theta_pi (based on pairwise differences)"""
347        combs = itertools.combinations(range(self.nsample), 2)
348
349        # Aligns chromosomes and count pairwise differences
350        genomes_sample_flat = self.gsample.reshape(self.nsample, -1)
351        diffs = np.array([(genomes_sample_flat[i[0]] != genomes_sample_flat[i[1]]).sum() for i in combs])
352        total_diffs = diffs.sum()
353        ncomparisons = diffs.size
354
355        return total_diffs / ncomparisons
356
357    def get_tajimas_d(self):
358        """Returns Tajima's D"""
359
360        if self.nsample < 3:
361            return None
362
363        pre_d = self.theta_pi - self.theta_w
364        segr_sites = self.segregating_sites_gsample
365
366        if segr_sites == 0:
367            logging.info("Cannot compute Tajima's D because there are no segregating sites.")
368            return
369
370        a_1 = self.harmonic(self.nsample - 1)
371        a_2 = self.harmonic_sq(self.nsample - 1)
372        b_1 = (self.nsample + 1) / (3 * (self.nsample - 1))
373        b_2 = (2 * (self.nsample**2 + self.nsample + 3)) / (9 * self.nsample * (self.nsample - 1))
374        c_1 = b_1 - (1 / a_1)
375        c_2 = b_2 - ((self.nsample + 2) / (a_1 * self.nsample)) + (a_2 / (a_1**2))
376        e_1 = c_1 / a_1
377        e_2 = c_2 / ((a_1**2) + a_2)
378        d_stdev = ((e_1 * segr_sites) + (e_2 * segr_sites * (segr_sites - 1))) ** 0.5
379
380        return pre_d / d_stdev
381
382    def get_sfs(self, reference_genome):
383        """Returns the site frequency spectrum (allele frequency spectrum) of a sample"""
384        pre_sfs = self.gsample.reshape(self.nsample, -1).sum(0)
385        pre_sfs[np.nonzero(reference_genome)] -= self.nsample
386        pre_sfs = np.abs(pre_sfs)
387        sfs = np.bincount(pre_sfs, minlength=self.nsample + 1)[:-1]  # TODO what if len(genomes) < sample_size
388        return sfs
389
390    def get_theta_h(self):
391        """Returns Fay and Wu's estimator theta_h"""
392        # sum from i=1 to i=n-1: ( (2 * S_i * i^2) / (n * (n-1)) )
393        sfs = self.sfs
394        t_h = ((2 * sfs * (np.arange(self.nsample) ** 2)) / (self.nsample * (self.nsample - 1))).sum()
395        return t_h
396
397    def get_fayandwu_h(self):
398        """Returns Fay and Wu's H"""
399        pre_h = self.theta_pi - self.theta_h
400        h_stdev = 1  # TODO: Calculate actual variance of h
401        return pre_h / h_stdev
pop_size_history
def record_pop_size_history(self, genomes):
15    def record_pop_size_history(self, genomes):
16        """Records population sizes at last 1000 steps"""
17        if len(self.pop_size_history) >= 1000:
18            del self.pop_size_history[0]
19        self.pop_size_history.append(len(genomes))

Records population sizes at last 1000 steps

def calc(self, input_genomes, mutation_rates):
21    def calc(self, input_genomes, mutation_rates):
22        """Calculates all popgen metrics
23
24        Set sample_size value to 0 or -1 to not perform any sampling.
25        """
26
27        # Infer ploidy from genomes
28        self.ploidy = input_genomes.shape[1]
29
30        # Data to process
31        self.genomes = self.make_3D(input_genomes)
32        self.gsample = self.get_genomes_sample()
33        self.nsample = 0 if self.gsample is None else len(self.gsample)  # TODO correct for ploidy?
34
35        # Statistics on population
36        self.n = self.get_n()
37        self.ne = self.get_ne()
38        self.allele_frequencies = self.get_allele_frequencies()
39        self.genotype_frequencies = self.get_genotype_frequencies()
40        self.mean_h_per_bit = self.get_mean_h_per_bit()
41        self.mean_h_per_locus = self.get_mean_h_per_locus()
42        self.mean_h = self.get_mean_h()
43        self.mean_h_per_bit_expected = self.get_mean_h_per_bit_expected()
44        self.mean_h_expected = self.get_mean_h_expected()
45        self.mu = self.get_mu(mutation_rates)
46        self.theta = self.get_theta()
47        self.segregating_sites = self.get_segregating_sites(self.genomes, self.ploidy)
48        self.reference_genome = self.get_reference_genome(self.genomes)
49
50        # Statistics on sample
51        if self.nsample:
52            # TODO gsample is essentially haploid, check if that causes issues
53            self.reference_genome_gsample = self.get_reference_genome(self.gsample)
54            self.segregating_sites_gsample = self.get_segregating_sites(self.gsample, 1)
55            self.theta_w = self.get_theta_w()
56            self.theta_pi = self.get_theta_pi()
57            self.tajimas_d = self.get_tajimas_d()
58            self.sfs = self.get_sfs(self.reference_genome_gsample)  # Uses reference genome calculated from sample
59            self.theta_h = self.get_theta_h()
60            self.fayandwu_h = self.get_fayandwu_h()
61        else:
62            # TODO will cause Recorder to fail?
63            (
64                self.reference_genome_gsample,
65                self.segregating_sites_gsample,
66                self.theta_w,
67                self.theta_pi,
68                self.tajimas_d,
69                self.sfs,
70                self.theta_h,
71                self.fayandwu_h,
72            ) = [None] * 8

Calculates all popgen metrics

Set sample_size value to 0 or -1 to not perform any sampling.

def emit_simple(self):
74    def emit_simple(self):
75        # TODO add headers
76        # TODO add headers and indices to all output files; and index and column names
77
78        attrs = [
79            "n",
80            "ne",
81            "mu",
82            "segregating_sites",
83            "segregating_sites_gsample",
84            "theta",
85            "theta_w",
86            "theta_pi",
87            "tajimas_d",
88            "theta_h",
89            "fayandwu_h",
90        ]
91
92        if self.ploidy == 2:
93            attrs += ["mean_h", "mean_h_expected"]
94
95        return {attr: getattr(self, attr) for attr in attrs}
def emit_complex(self):
 97    def emit_complex(self):
 98        attrs = [
 99            "allele_frequencies",
100            "genotype_frequencies",
101            "sfs",
102            "reference_genome",
103            "reference_genome_gsample",
104        ]
105
106        if self.ploidy == 2:
107            attrs += ["mean_h_per_bit", "mean_h_per_locus", "mean_h_per_bit_expected"]
108
109        return {attr: getattr(self, attr) for attr in attrs}
@staticmethod
def harmonic(i):
115    @staticmethod
116    def harmonic(i):
117        """Returns the i-th harmonic number"""
118        return (1 / np.arange(1, i + 1)).sum()

Returns the i-th harmonic number

@staticmethod
def harmonic_sq(i):
120    @staticmethod
121    def harmonic_sq(i):
122        """Returns the i-th harmonic square"""
123        return (1 / np.arange(1, i + 1) ** 2).sum()

Returns the i-th harmonic square

@staticmethod
def make_3D(input_genomes):
125    @staticmethod
126    def make_3D(input_genomes):
127        """Returns genomes array with merged chromosomes
128
129        Methods of PopgenStats require the genomes to be in the form [individual, locus, bit] where, if individuals are diploid,
130        the odd bits belong to a virtual first chromosome, while the even bits belong to a virtual second chromosome.
131        If individuals are haploid, all bits belong to one virtual chromosome.
132
133        This is contrast with the genomes arrays in the rest of AEGIS which is in form [individual, chromosome, locus, bit]
134        so that all bits from one chromosome are in a separate array than those in the second chromosome.
135        If individuals are haploid, the chromosome dimension contains only one element.
136        """
137
138        n_individuals, ploidy, n_loci, bits_per_locus = input_genomes.shape
139
140        # TODO report when conversion cannot be executed
141
142        if ploidy == 1:
143            return input_genomes[:, 0]
144
145        else:
146            genomes = np.empty(
147                shape=(n_individuals, n_loci, ploidy * bits_per_locus),
148                dtype=np.bool_,
149            )
150
151            # Odd bits contain bits from chromosome 0
152            genomes[:, :, 0::2] = input_genomes[:, 0]
153
154            # Even bits contain bits from chromosome 1
155            genomes[:, :, 1::2] = input_genomes[:, 1]
156
157        return genomes

Returns genomes array with merged chromosomes

Methods of PopgenStats require the genomes to be in the form [individual, locus, bit] where, if individuals are diploid, the odd bits belong to a virtual first chromosome, while the even bits belong to a virtual second chromosome. If individuals are haploid, all bits belong to one virtual chromosome.

This is contrast with the genomes arrays in the rest of AEGIS which is in form [individual, chromosome, locus, bit] so that all bits from one chromosome are in a separate array than those in the second chromosome. If individuals are haploid, the chromosome dimension contains only one element.

@staticmethod
def make_4D(genomes, ploidy):
159    @staticmethod
160    def make_4D(genomes, ploidy):
161        """Returns genomes array with chromosomes split along the second dimension"""
162
163        n_individuals, n_loci, n_bits = genomes.shape
164        bits_per_locus = n_bits // ploidy
165
166        # TODO report when conversion cannot be executed
167
168        unstaggered = np.empty(
169            shape=(n_individuals, ploidy, n_loci, bits_per_locus),
170            dtype=np.bool_,
171        )
172
173        if ploidy == 1:
174            unstaggered[:, 0] = genomes
175        if ploidy == 2:
176            # Chromosome 0 contains odd bits from input genomes
177            unstaggered[:, 0] = genomes[:, :, 0::2]
178            # Chromosome 1 contains even bits from input genomes
179            unstaggered[:, 1] = genomes[:, :, 1::2]
180
181        return unstaggered

Returns genomes array with chromosomes split along the second dimension

def get_genomes_sample(self):
183    def get_genomes_sample(self):
184        """Returns a random sample of genomes"""
185        if self.ploidy > 1:
186            # The chromosomes get aligned
187            # 3D (individuals, loci, bits): [1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4] ->
188            #
189            # 2D (everything, 2): [[1, 1, 1, 1, 3, 3, 3, 3],
190            #                      [2, 2, 2, 2, 4, 4, 4, 4]] ->
191            #
192            # 3D (chromosomes, loci, bits // 2): [[1, 1, 1, 1],
193            #    (individuals * 2, ...)           [2, 2, 2, 2],
194            #                                     [3, 3, 3, 3],
195            #                                     [4, 4, 4, 4]]
196            genomes = (
197                self.genomes.reshape(-1, 2).transpose().reshape(self.genomes.shape[0] * 2, self.genomes.shape[1], -1)
198            )
199        else:
200            genomes = self.genomes
201
202        # TODO check if change in ploidy has implications for popgen stats
203
204        # Check if there are enough genomes to sample
205        if len(genomes) < 2:  # NOTE tajimas_d requires minimum 2
206            return None
207
208        # Sample genomes
209        if 0 < parametermanager.parameters.POPGENSTATS_SAMPLE_SIZE <= genomes.shape[0]:
210            indices = np.random.choice(
211                range(genomes.shape[0]),
212                parametermanager.parameters.POPGENSTATS_SAMPLE_SIZE,
213                replace=False,
214            )
215            return genomes[indices]
216        else:
217            return genomes

Returns a random sample of genomes

def get_n(self):
223    def get_n(self):
224        """Returns the census population size N"""
225        return self.pop_size_history[-1]

Returns the census population size N

def get_ne(self):
227    def get_ne(self):
228        """Returns the effective population size Ne"""
229        return statistics.harmonic_mean(self.pop_size_history)

Returns the effective population size Ne

def get_allele_frequencies(self):
231    def get_allele_frequencies(self):
232        """Returns the frequency of the 1-allele at every position of the genome"""
233        # Aligns the genomes of the population, disregarding chromosomes, and takes the mean
234        return self.genomes.reshape(self.genomes.shape[0], -1).mean(0)

Returns the frequency of the 1-allele at every position of the genome

def get_genotype_frequencies(self):
236    def get_genotype_frequencies(self):
237        """Output: [loc1_bit1_freq00, loc1_bit1_freq01, loc1_bit1_freq11, loc1_bit2_freq00, ...]"""
238        if self.ploidy == 1:
239            return self.allele_frequencies
240
241        len_pop = self.genomes.shape[0]
242
243        # Genotype = Sum of alleles at a position -> [0, 1, 2]
244        genotypes_raw = self.genomes.reshape(-1, 2).sum(1).reshape(len_pop, -1).transpose()
245
246        # Counts the frequencies of 0, 1 and 2 across the population
247        genotype_freqs = np.array([np.bincount(x, minlength=3) for x in genotypes_raw]).reshape(-1) / len_pop
248
249        return genotype_freqs

Output: [loc1_bit1_freq00, loc1_bit1_freq01, loc1_bit1_freq11, loc1_bit2_freq00, ...]

def get_mean_h_per_bit(self):
251    def get_mean_h_per_bit(self):
252        """Returns the mean heterozygosity per bit.
253        Output: [Hloc1_bit1, Hloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci"""
254        if self.ploidy == 1:
255            return None
256
257        return self.genotype_frequencies[1::3]

Returns the mean heterozygosity per bit. Output: [Hloc1_bit1, Hloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci

def get_mean_h_per_locus(self):
259    def get_mean_h_per_locus(self):
260        """Returns the mean heterozygosity per locus.
261        Output: [Hloc1, Hloc2, ...] Entries: nloci"""
262        if self.ploidy == 1:
263            return None
264
265        h_per_bit = self.mean_h_per_bit
266        return h_per_bit.reshape(-1, self.genomes.shape[2] // 2).mean(1)

Returns the mean heterozygosity per locus. Output: [Hloc1, Hloc2, ...] Entries: nloci

def get_mean_h(self):
268    def get_mean_h(self):
269        """Returns the mean heterozygosity of a population.
270        Output: H"""
271        if self.ploidy == 1:
272            return None
273
274        return self.mean_h_per_bit.mean()

Returns the mean heterozygosity of a population. Output: H

def get_mean_h_per_bit_expected(self):
276    def get_mean_h_per_bit_expected(self):
277        """Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium.
278        Output: [Heloc1_bit1, Heloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci"""
279        if self.ploidy == 1:
280            return None
281
282        genotype_freqs_sqrd = self.genotype_frequencies**2
283        sum_each_locus = genotype_freqs_sqrd.reshape(-1, 3).sum(1)
284        return 1 - sum_each_locus

Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium. Output: [Heloc1_bit1, Heloc1_bit2, ...] Entries: (bits_per_locus // 2) * nloci

def get_mean_h_expected(self):
286    def get_mean_h_expected(self):
287        """Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium.
288        Output: He"""
289        if self.ploidy == 1:
290            return None
291
292        return self.mean_h_per_bit_expected.mean()

Returns the expected mean heterozygosity per bit under Hardy-Weinberg-Equilibrium. Output: He

def get_mu(self, mutation_rates):
294    def get_mu(self, mutation_rates):
295        """Return the mutation rate µ per gene per generation -> AEGIS-'Locus' interpreted as a gene"""
296        return np.mean(mutation_rates)

Return the mutation rate µ per gene per generation -> AEGIS-'Locus' interpreted as a gene

def get_theta(self):
298    def get_theta(self):
299        """Returns the adjusted mutation rate theta = 4 * Ne * µ,
300        where µ is the mutation rate per gene per generation and Ne is the effective population size"""
301        return (self.ploidy * 2) * self.ne * self.mu

Returns the adjusted mutation rate theta = 4 * Ne * µ, where µ is the mutation rate per gene per generation and Ne is the effective population size

def get_reference_genome(self, genomes):
307    def get_reference_genome(self, genomes):
308        """Returns the reference genome based on which allele is most common at each position.
309        Equal fractions -> 0"""
310        return np.round(genomes.reshape(genomes.shape[0], -1).mean(0)).astype("int32")

Returns the reference genome based on which allele is most common at each position. Equal fractions -> 0

def get_segregating_sites(self, genomes, ploidy):
312    def get_segregating_sites(self, genomes, ploidy):
313        """Returns the number of segregating sites
314
315        Genomes are first aligned and summed at each site across the population.
316        A site is segregating if its sum is not equal to either 0 or the population size N.
317        """
318
319        if ploidy == 1:
320            pre_segr_sites = genomes.reshape(genomes.shape[0], -1).sum(0)
321            segr_sites = (
322                genomes.shape[1] * genomes.shape[2]
323                - (pre_segr_sites == genomes.shape[0]).sum()
324                - (pre_segr_sites == 0).sum()
325            )
326
327            return segr_sites
328
329        pre_segr_sites = genomes.reshape(-1, 2).transpose().reshape(genomes.shape[0] * 2, -1).sum(0)
330        segr_sites = (
331            ((genomes.shape[1] * genomes.shape[2]) // 2)
332            - (pre_segr_sites == genomes.shape[0] * 2).sum()
333            - (pre_segr_sites == 0).sum()
334        )
335        return segr_sites

Returns the number of segregating sites

Genomes are first aligned and summed at each site across the population. A site is segregating if its sum is not equal to either 0 or the population size N.

def get_theta_w(self):
341    def get_theta_w(self):
342        """Returns Watterson's estimator theta_w"""
343        return self.segregating_sites_gsample / self.harmonic(self.nsample - 1)

Returns Watterson's estimator theta_w

def get_theta_pi(self):
345    def get_theta_pi(self):
346        """Returns the estimator theta_pi (based on pairwise differences)"""
347        combs = itertools.combinations(range(self.nsample), 2)
348
349        # Aligns chromosomes and count pairwise differences
350        genomes_sample_flat = self.gsample.reshape(self.nsample, -1)
351        diffs = np.array([(genomes_sample_flat[i[0]] != genomes_sample_flat[i[1]]).sum() for i in combs])
352        total_diffs = diffs.sum()
353        ncomparisons = diffs.size
354
355        return total_diffs / ncomparisons

Returns the estimator theta_pi (based on pairwise differences)

def get_tajimas_d(self):
357    def get_tajimas_d(self):
358        """Returns Tajima's D"""
359
360        if self.nsample < 3:
361            return None
362
363        pre_d = self.theta_pi - self.theta_w
364        segr_sites = self.segregating_sites_gsample
365
366        if segr_sites == 0:
367            logging.info("Cannot compute Tajima's D because there are no segregating sites.")
368            return
369
370        a_1 = self.harmonic(self.nsample - 1)
371        a_2 = self.harmonic_sq(self.nsample - 1)
372        b_1 = (self.nsample + 1) / (3 * (self.nsample - 1))
373        b_2 = (2 * (self.nsample**2 + self.nsample + 3)) / (9 * self.nsample * (self.nsample - 1))
374        c_1 = b_1 - (1 / a_1)
375        c_2 = b_2 - ((self.nsample + 2) / (a_1 * self.nsample)) + (a_2 / (a_1**2))
376        e_1 = c_1 / a_1
377        e_2 = c_2 / ((a_1**2) + a_2)
378        d_stdev = ((e_1 * segr_sites) + (e_2 * segr_sites * (segr_sites - 1))) ** 0.5
379
380        return pre_d / d_stdev

Returns Tajima's D

def get_sfs(self, reference_genome):
382    def get_sfs(self, reference_genome):
383        """Returns the site frequency spectrum (allele frequency spectrum) of a sample"""
384        pre_sfs = self.gsample.reshape(self.nsample, -1).sum(0)
385        pre_sfs[np.nonzero(reference_genome)] -= self.nsample
386        pre_sfs = np.abs(pre_sfs)
387        sfs = np.bincount(pre_sfs, minlength=self.nsample + 1)[:-1]  # TODO what if len(genomes) < sample_size
388        return sfs

Returns the site frequency spectrum (allele frequency spectrum) of a sample

def get_theta_h(self):
390    def get_theta_h(self):
391        """Returns Fay and Wu's estimator theta_h"""
392        # sum from i=1 to i=n-1: ( (2 * S_i * i^2) / (n * (n-1)) )
393        sfs = self.sfs
394        t_h = ((2 * sfs * (np.arange(self.nsample) ** 2)) / (self.nsample * (self.nsample - 1))).sum()
395        return t_h

Returns Fay and Wu's estimator theta_h

def get_fayandwu_h(self):
397    def get_fayandwu_h(self):
398        """Returns Fay and Wu's H"""
399        pre_h = self.theta_pi - self.theta_h
400        h_stdev = 1  # TODO: Calculate actual variance of h
401        return pre_h / h_stdev

Returns Fay and Wu's H