Skip to content

Commit

Permalink
Merge pull request #124 from szhan/add_nb_haploid
Browse files Browse the repository at this point in the history
Add Jupyter notebook showing how to use haploid LS HMM algorithms
  • Loading branch information
szhan authored Jun 19, 2024
2 parents 3371ca1 + 299f855 commit a530b54
Showing 1 changed file with 229 additions and 0 deletions.
229 changes: 229 additions & 0 deletions notebooks/demo_haploid.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import msprime\n",
"import lshmm\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Input data\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# First, let's simulate some haplotypes to use as a reference panel.\n",
"seed = 1208\n",
"ts = msprime.sim_mutations(\n",
" msprime.sim_ancestry(\n",
" samples=10,\n",
" ploidy=1,\n",
" sequence_length=1e4,\n",
" discrete_genome=True,\n",
" recombination_rate=1e-4,\n",
" random_seed=seed,\n",
" ),\n",
" rate=1e-4,\n",
" random_seed=seed,\n",
")\n",
"ts\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A reference panel must be of size (number of sites, number of haplotypes).\n",
"ref_panel = ts.genotype_matrix()\n",
"\n",
"# A query must be of size (1, number of sites).\n",
"# It can contain MISSING values, but not NONCOPY values.\n",
"query = ref_panel[:, 0].reshape(1, ts.num_sites)\n",
"\n",
"# The number of distinct alleles per site is needed to get per-site mutation rates.\n",
"num_alleles = lshmm.core.get_num_alleles(ref_panel, query)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Per-site recombination and mutation probabilities are needed\n",
"# to parametrise the LS HMM.\n",
"rho = np.zeros(ts.num_sites, dtype=np.float64) + 1e-4\n",
"rho[0] = 0\n",
"mu = np.zeros(ts.num_sites, dtype=np.float64) + 1e-4\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Viterbi algorithm"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Let's determine the best copying path and its log-likelihood using the Viterbi algorithm.\n",
"path, log_lik = lshmm.api.viterbi(\n",
" ref_panel,\n",
" query,\n",
" num_alleles,\n",
" prob_recombination=rho,\n",
" prob_mutation=mu,\n",
" scale_mutation_rate=True,\n",
")\n",
"print(f\"Parents in the path: {len(np.unique(path))}\")\n",
"print(f\"Log-likelihood: {log_lik}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# There is a function to evaluate the log-likelihood of a copying path.\n",
"# Let's check that we get the same log-likelihood of the same copying path \n",
"# under the same parameters using the function.\n",
"log_lik_2 = lshmm.api.path_loglik(\n",
" ref_panel,\n",
" query,\n",
" num_alleles,\n",
" path,\n",
" prob_recombination=rho,\n",
" prob_mutation=mu,\n",
" scale_mutation_rate=True,\n",
")\n",
"print(f\"Is the log-likelihood identical? {log_lik == log_lik_2}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We should get a different log-likelihood value if the parameters are not the same.\n",
"# Let's check the log-likelihood of the same copying path under different parameters.\n",
"log_lik_3 = lshmm.api.path_loglik(\n",
" ref_panel,\n",
" query,\n",
" num_alleles,\n",
" path,\n",
" prob_recombination=rho,\n",
" prob_mutation=mu + 1e-5, # Increase\n",
" scale_mutation_rate=True,\n",
")\n",
"print(f\"Is the log-likelihood identical? {log_lik == log_lik_3}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Forward-backward algorithm"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# To calculate the forward probability matrix of a query,\n",
"# let's use the forward algorithm.\n",
"np.set_printoptions(linewidth=100, precision=4)\n",
"fwd_mat, norm_factor, log_lik = lshmm.api.forwards(\n",
" ref_panel,\n",
" query,\n",
" num_alleles=num_alleles,\n",
" prob_recombination=rho,\n",
" prob_mutation=mu,\n",
" scale_mutation_rate=True,\n",
" normalise=True,\n",
")\n",
"fwd_mat\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Now, to calculate the backward probability matrix of the same query,\n",
"# let's use the backward algorithm.\n",
"np.set_printoptions(linewidth=100, precision=4)\n",
"bwd_mat = lshmm.api.backwards(\n",
" ref_panel,\n",
" query,\n",
" num_alleles,\n",
" normalisation_factor_from_forward=norm_factor,\n",
" prob_recombination=rho,\n",
" prob_mutation=mu,\n",
" scale_mutation_rate=True,\n",
")\n",
"bwd_mat\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check the integrity of the matrices.\n",
"np.allclose(np.sum(fwd_mat * bwd_mat, axis=1), 1.0)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit a530b54

Please sign in to comment.