From 7e63c50e72d7a2db685a771ea657481ff141c181 Mon Sep 17 00:00:00 2001 From: Shing Zhan Date: Sun, 21 Jan 2024 20:06:26 +0000 Subject: [PATCH] Move docstring --- python/tests/beagle_numba.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py index 8a467de828..a2601c7c3f 100644 --- a/python/tests/beagle_numba.py +++ b/python/tests/beagle_numba.py @@ -2,7 +2,40 @@ Implementation of the BEAGLE algorithm to impute alleles by linear interpolation of state probabilities at ungenotyped markers in the HMM state probability matrix. -The source codes of BEAGLE 4.1 and 5.4 are closely consulted. +This was implemented while closely consulting the BEAGLE 4.1 paper: +Browning & Browning (2016). Genotype imputation with millions of reference samples. +Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020 + +The BEAGLE 4.1 source code (particularly `LSHapBaum.java`) was closely consulted: +https://faculty.washington.edu/browning/beagle/b4_1.html + +These notations are used throughout this implementation: +h = number of reference haplotypes. +m = number of genotyped markers. +x = number of ungenotyped markers. + +This implementation takes the following inputs: +1. Panel of reference haplotypes in a matrix of size (m + x, h). +2. One query haplotype in an array of size (m + x). +3. Physical positions of all the markers in an array of size (m + x). + +In the query haplotype: +1. The genotyped positions take values of 0, 1, 2, or 3 (ACGT encoding). +2. The ungenotyped positions take -1. + +The forward and backward probability matrices are of size (m, h). +The HMM state probability matrix is of size (m, h). +The interpolated allele probability matrix is of size (x, 4). +The imputed alleles are the maximum a posteriori (MAP) alleles. + +To improve computational efficiency, BEAGLE uses aggregated markers, +which are clusters of markers within a 0.005 cM interval (default). 
+Because the genotypes are phased, the alleles in the aggregated markers +form distinct "allele sequences". Below, we do not use aggregated markers +or allele sequences, which would complicate the implementation. + +Rather than trying to exactly replicate the original BEAGLE 4.1 algorithm, +this implementation uses Equation 1 of BB2016 (Browning & Browning, 2016). """ from dataclasses import dataclass