From 299f855d457546b1a1cf0f713bf708e9c9bccfb9 Mon Sep 17 00:00:00 2001 From: szhan Date: Wed, 19 Jun 2024 15:55:54 +0100 Subject: [PATCH] Add Jupyter notebook showing how to use haploid LS HMM algorithms --- notebooks/demo_haploid.ipynb | 229 +++++++++++++++++++++++++++++++++++ 1 file changed, 229 insertions(+) create mode 100644 notebooks/demo_haploid.ipynb diff --git a/notebooks/demo_haploid.ipynb b/notebooks/demo_haploid.ipynb new file mode 100644 index 0000000..aacc13c --- /dev/null +++ b/notebooks/demo_haploid.ipynb @@ -0,0 +1,229 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import msprime\n", + "import lshmm\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Input data\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First, let's simulate some haplotypes to use as a reference panel.\n", + "seed = 1208\n", + "ts = msprime.sim_mutations(\n", + " msprime.sim_ancestry(\n", + " samples=10,\n", + " ploidy=1,\n", + " sequence_length=1e4,\n", + " discrete_genome=True,\n", + " recombination_rate=1e-4,\n", + " random_seed=seed,\n", + " ),\n", + " rate=1e-4,\n", + " random_seed=seed,\n", + ")\n", + "ts\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# A reference panel must be of size (number of sites, number of haplotypes).\n", + "ref_panel = ts.genotype_matrix()\n", + "\n", + "# A query must be of size (1, number of sites).\n", + "# It can contain MISSING values, but not NONCOPY values.\n", + "query = ref_panel[:, 0].reshape(1, ts.num_sites)\n", + "\n", + "# The number of distinct alleles per site is needed to get per-site mutation rates.\n", + "num_alleles = lshmm.core.get_num_alleles(ref_panel, query)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Per-site recombination and mutation probabilities are needed\n", + "# to parametrise the LS HMM.\n", + "rho = np.zeros(ts.num_sites, dtype=np.float64) + 1e-4\n", + "rho[0] = 0\n", + "mu = np.zeros(ts.num_sites, dtype=np.float64) + 1e-4\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Viterbi algorithm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's determine the best copying path and its log-likelihood using the Viterbi algorithm.\n", + "path, log_lik = lshmm.api.viterbi(\n", + " ref_panel,\n", + " query,\n", + " num_alleles,\n", + " prob_recombination=rho,\n", + " prob_mutation=mu,\n", + " scale_mutation_rate=True,\n", + ")\n", + "print(f\"Parents in the path: {len(np.unique(path))}\")\n", + "print(f\"Log-likelihood: {log_lik}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# There is a function to evaluate the log-likelihood of a copying path.\n", + "# Let's check that we get the same log-likelihood of the same copying path \n", + "# under the same parameters using the function.\n", + "log_lik_2 = lshmm.api.path_loglik(\n", + " ref_panel,\n", + " query,\n", + " num_alleles,\n", + " path,\n", + " prob_recombination=rho,\n", + " prob_mutation=mu,\n", + " scale_mutation_rate=True,\n", + ")\n", + "print(f\"Is the log-likelihood identical? {log_lik == log_lik_2}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We should get a different log-likelihood value if the parameters are not the same.\n", + "# Let's check the log-likelihood of the same copying path under different parameters.\n", + "log_lik_3 = lshmm.api.path_loglik(\n", + " ref_panel,\n", + " query,\n", + " num_alleles,\n", + " path,\n", + " prob_recombination=rho,\n", + " prob_mutation=mu + 1e-5, # Increase\n", + " scale_mutation_rate=True,\n", + ")\n", + "print(f\"Is the log-likelihood identical? {log_lik == log_lik_3}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Forward-backward algorithm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# To calculate the forward probability matrix of a query,\n", + "# let's use the forward algorithm.\n", + "np.set_printoptions(linewidth=100, precision=4)\n", + "fwd_mat, norm_factor, log_lik = lshmm.api.forwards(\n", + " ref_panel,\n", + " query,\n", + " num_alleles=num_alleles,\n", + " prob_recombination=rho,\n", + " prob_mutation=mu,\n", + " scale_mutation_rate=True,\n", + " normalise=True,\n", + ")\n", + "fwd_mat\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Now, to calculate the backward probability matrix of the same query,\n", + "# let's use the backward algorithm.\n", + "np.set_printoptions(linewidth=100, precision=4)\n", + "bwd_mat = lshmm.api.backwards(\n", + " ref_panel,\n", + " query,\n", + " num_alleles,\n", + " normalisation_factor_from_forward=norm_factor,\n", + " prob_recombination=rho,\n", + " prob_mutation=mu,\n", + " scale_mutation_rate=True,\n", + ")\n", + "bwd_mat\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check the integrity of the matrices.\n", + "np.allclose(np.sum(fwd_mat * bwd_mat, axis=1), 1.0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}