From 7e63c50e72d7a2db685a771ea657481ff141c181 Mon Sep 17 00:00:00 2001
From: Shing Zhan <shing.zhan@gmail.com>
Date: Sun, 21 Jan 2024 20:06:26 +0000
Subject: [PATCH] Move docstring

---
 python/tests/beagle_numba.py | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py
index 8a467de828..a2601c7c3f 100644
--- a/python/tests/beagle_numba.py
+++ b/python/tests/beagle_numba.py
@@ -2,7 +2,40 @@
 Implementation of the BEAGLE algorithm to impute alleles by linear interpolation of
 state probabilities at ungenotyped markers in the HMM state probability matrix.
 
-The source codes of BEAGLE 4.1 and 5.4 are closely consulted.
+This was implemented while closely consulting the BEAGLE 4.1 paper:
+Browning & Browning (2016). Genotype imputation with millions of reference samples.
+Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020
+
+The BEAGLE 4.1 source code (particularly `LSHapBaum.java`) was closely consulted:
+https://faculty.washington.edu/browning/beagle/b4_1.html
+
+These notations are used throughout this implementation:
+h = number of reference haplotypes.
+m = number of genotyped markers.
+x = number of ungenotyped markers.
+
+This implementation takes the following inputs:
+1. Panel of reference haplotypes in a matrix of size (m + x, h).
+2. One query haplotype in an array of size (m + x).
+3. Physical positions of all the markers in an array of size (m + x).
+
+In the query haplotype:
+1. The genotyped positions take values of 0, 1, 2, or 3 (ACGT encoding).
+2. The ungenotyped positions take -1.
+
+The forward and backward probability matrices are of size (m, h).
+The HMM state probability matrix is of size (m, h).
+The interpolated allele probability matrix is of size (x, 4).
+The imputed alleles are the maximum a posteriori (MAP) alleles.
+
+To improve computational efficiency, BEAGLE uses aggregated markers,
+which are clusters of markers within a 0.005 cM interval (default).
+Because the genotypes are phased, the alleles in the aggregated markers
+form distinct "allele sequences". Below, we do not use aggregated markers
+or allele sequences, which would complicate the implementation.
+
+Rather than trying to exactly replicate the original BEAGLE 4.1 algorithm,
+this implementation uses Equation 1 of Browning & Browning (2016).
 """
 from dataclasses import dataclass