Skip to content

Commit

Permalink
Merge pull request #269 from szhan/add_nb_find_seeds
Browse files Browse the repository at this point in the history
Add a notebook documenting how to find candidate seeds
  • Loading branch information
jeromekelleher authored Dec 17, 2024
2 parents 0d2558a + 5a25a91 commit d41c028
Showing 1 changed file with 218 additions and 0 deletions.
218 changes: 218 additions & 0 deletions notebooks/find_candidate_seeds.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# See associated issue\n",
"# https://github.com/jeromekelleher/sc2ts-paper/issues/268\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Download files"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"wget --quiet https://raw.githubusercontent.com/cov-lineages/pango-designation/16205e716c6a68ff1c3d0f26f0c77478682368ac/lineages.csv\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"curl -s -X 'GET' \\\n",
" 'https://www.ebi.ac.uk/ena/portal/api/filereport?result=read_run&accession=PRJEB37886&fields=sample_accession%2Csample_alias&limit=0&format=tsv&download=true' \\\n",
" -H 'accept: */*' > filereport_read_run_PRJEB37886_tsv.txt\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"wget --quiet --content-disposition https://figshare.com/ndownloader/files/49694808\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Parse files\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# lineage, sample name\n",
"pango = pd.read_csv(\"lineages.csv\", sep=\",\")\n",
"pango[\"sample_name\"] = [s.split(\"/\")[1] for s in pango[\"taxon\"]]\n",
"pango.head(1)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# run accession, sample name\n",
"ena = pd.read_csv(\"filereport_read_run_PRJEB37886_tsv.txt\", sep=\"\\t\")\n",
"ena[\"sample_name\"] = [s.split(\"/\")[1] for s in ena[\"sample_alias\"]]\n",
"ena.head(1)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run (strain)\n",
"viridian = pd.read_csv(\"run_metadata.v05.tsv.gz\", sep=\"\\t\")\n",
"viridian = viridian[viridian[\"Date_tree\"] != \"none\"]\n",
"viridian[\"parsed_datetime\"] = pd.to_datetime(\n",
" viridian[\"Date_tree\"],\n",
" format='%Y-%m-%d',\n",
" errors='coerce',\n",
")\n",
"viridian = viridian[viridian[\"parsed_datetime\"].notna()]\n",
"viridian.head(1)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Search among the COG-UK samples"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Chosen by trial-and-error\n",
"threshold_dates_dict = {\n",
" \"B.1.617.1\": \"2021-04-01\",\n",
" \"B.1.617.2\": \"2021-04-01\",\n",
" \"BA.1\": \"2021-12-01\",\n",
" \"BA.2\": \"2022-01-08\",\n",
" \"BA.4\": \"2022-04-01\",\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"for focal_pango, threshold_date in threshold_dates_dict.items():\n",
" out_file = \"\".join([\n",
" \"candidate_seeds\", \"_\", focal_pango, \"_\", \\\n",
" \"pre\", \"-\", threshold_date, \\\n",
" ]) + \".txt\"\n",
"\n",
" designated_samples = pango[pango[\"lineage\"] == focal_pango][\"sample_name\"]\n",
" coguk_runs = ena[ena[\"sample_name\"].isin(designated_samples)][\"run_accession\"]\n",
" viridian_samples = viridian[viridian[\"Run\"].isin(coguk_runs)]\n",
"\n",
" viridian_samples[\n",
" (viridian_samples[\"parsed_datetime\"] < pd.to_datetime(threshold_date)) & \\\n",
" (viridian_samples[\"parsed_datetime\"] != pd.to_datetime(\"2020-12-31\"))\n",
" ][[\"Run\", \"Date_tree\"]].to_csv(out_file, index=False)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Search among the South Africa samples for Omicron seeds"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Chosen by trial-and-error\n",
"threshold_dates_dict = {\n",
" \"BA.1\": \"2021-10-01\",\n",
" \"BA.2\": \"2021-12-01\",\n",
" \"BA.4\": \"2022-01-01\",\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"for focal_pango, threshold_date in threshold_dates_dict.items():\n",
" out_file = \"\".join([\n",
" \"candidate_seeds\", \"_\", focal_pango, \"_\", \\\n",
" \"SouthAfrica\", \"_\", \\\n",
" \"pre\", \"-\", threshold_date,\n",
" ]) + \".txt\"\n",
"\n",
" viridian[\n",
" (viridian[\"Viridian_pangolin\"] == focal_pango) & \\\n",
" (viridian[\"Country\"] == \"South Africa\") & \\\n",
" (viridian[\"parsed_datetime\"] < pd.to_datetime(threshold_date))\n",
" ][[\"Run\", \"Date_tree\"]].to_csv(out_file, index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit d41c028

Please sign in to comment.