From 5a25a917a6abf50127ec7c617ab980b81ad533f7 Mon Sep 17 00:00:00 2001 From: szhan Date: Tue, 17 Dec 2024 15:40:19 +0000 Subject: [PATCH] Add a notebook documenting how to find candidate seeds --- notebooks/find_candidate_seeds.ipynb | 218 +++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 notebooks/find_candidate_seeds.ipynb diff --git a/notebooks/find_candidate_seeds.ipynb b/notebooks/find_candidate_seeds.ipynb new file mode 100644 index 0000000..acd5957 --- /dev/null +++ b/notebooks/find_candidate_seeds.ipynb @@ -0,0 +1,218 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# See associated issue\n", + "# https://github.com/jeromekelleher/sc2ts-paper/issues/268\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Download files" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "wget --quiet https://raw.githubusercontent.com/cov-lineages/pango-designation/16205e716c6a68ff1c3d0f26f0c77478682368ac/lineages.csv\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "curl -s -X 'GET' \\\n", + " 'https://www.ebi.ac.uk/ena/portal/api/filereport?result=read_run&accession=PRJEB37886&fields=sample_accession%2Csample_alias&limit=0&format=tsv&download=true' \\\n", + " -H 'accept: */*' > filereport_read_run_PRJEB37886_tsv.txt\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "wget --quiet --content-disposition https://figshare.com/ndownloader/files/49694808\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Parse files\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# lineage, sample name\n", + "pango = pd.read_csv(\"lineages.csv\", sep=\",\")\n", + "pango[\"sample_name\"] = [s.split(\"/\")[1] for s in pango[\"taxon\"]]\n", + "pango.head(1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# run accession, sample name\n", + "ena = pd.read_csv(\"filereport_read_run_PRJEB37886_tsv.txt\", sep=\"\\t\")\n", + "ena[\"sample_name\"] = [s.split(\"/\")[1] for s in ena[\"sample_alias\"]]\n", + "ena.head(1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run (strain)\n", + "viridian = pd.read_csv(\"run_metadata.v05.tsv.gz\", sep=\"\\t\")\n", + "viridian = viridian[viridian[\"Date_tree\"] != \"none\"]\n", + "viridian[\"parsed_datetime\"] = pd.to_datetime(\n", + " viridian[\"Date_tree\"],\n", + " format='%Y-%m-%d',\n", + " errors='coerce',\n", + ")\n", + "viridian = viridian[viridian[\"parsed_datetime\"].notna()]\n", + "viridian.head(1)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Search among the COG-UK samples" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# Chosen by trial-and-error\n", + "threshold_dates_dict = {\n", + " \"B.1.617.1\": \"2021-04-01\",\n", + " \"B.1.617.2\": \"2021-04-01\",\n", + " \"BA.1\": \"2021-12-01\",\n", + " \"BA.2\": \"2022-01-08\",\n", + " \"BA.4\": \"2022-04-01\",\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "for focal_pango, threshold_date in threshold_dates_dict.items():\n", + " out_file = \"\".join([\n", + " \"candidate_seeds\", \"_\", focal_pango, \"_\", \\\n", + " \"pre\", \"-\", threshold_date, \\\n", + " ]) + \".txt\"\n", + "\n", + " designated_samples = pango[pango[\"lineage\"] == focal_pango][\"sample_name\"]\n", + " coguk_runs = ena[ena[\"sample_name\"].isin(designated_samples)][\"run_accession\"]\n", + " viridian_samples = viridian[viridian[\"Run\"].isin(coguk_runs)]\n", + "\n", + " viridian_samples[\n", + " (viridian_samples[\"parsed_datetime\"] < pd.to_datetime(threshold_date)) & \\\n", + " (viridian_samples[\"parsed_datetime\"] != pd.to_datetime(\"2020-12-31\"))\n", + " ][[\"Run\", \"Date_tree\"]].to_csv(out_file, index=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Search among the South Africa samples for Omicron seeds" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Chosen by trial-and-error\n", + "threshold_dates_dict = {\n", + " \"BA.1\": \"2021-10-01\",\n", + " \"BA.2\": \"2021-12-01\",\n", + " \"BA.4\": \"2022-01-01\",\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "for focal_pango, threshold_date in threshold_dates_dict.items():\n", + " out_file = \"\".join([\n", + " \"candidate_seeds\", \"_\", focal_pango, \"_\", \\\n", + " \"SouthAfrica\", \"_\", \\\n", + " \"pre\", \"-\", threshold_date,\n", + " ]) + \".txt\"\n", + "\n", + " viridian[\n", + " (viridian[\"Viridian_pangolin\"] == focal_pango) & \\\n", + " (viridian[\"Country\"] == \"South Africa\") & \\\n", + " (viridian[\"parsed_datetime\"] < pd.to_datetime(threshold_date))\n", + " ][[\"Run\", \"Date_tree\"]].to_csv(out_file, index=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}