From 956cb1773142db6bddbc0b55a1cdb0d384f7a18d Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 2024 14:56:25 +0200 Subject: [PATCH 01/20] add first draft of the tutorial --- docs/conf.py | 56 +- docs/index.md | 6 +- docs/tutorial/analyse_binette_result.ipynb | 1688 ++++++++++++++++++++ docs/tutorial/assembly.md | 43 + docs/tutorial/binette.md | 7 + docs/tutorial/binning.md | 67 + docs/tutorial/set_env_and_get_data.md | 78 + docs/tutorial/tutorial_main.md | 52 + pyproject.toml | 7 +- 9 files changed, 1996 insertions(+), 8 deletions(-) create mode 100644 docs/tutorial/analyse_binette_result.ipynb create mode 100644 docs/tutorial/assembly.md create mode 100644 docs/tutorial/binette.md create mode 100644 docs/tutorial/binning.md create mode 100644 docs/tutorial/set_env_and_get_data.md create mode 100644 docs/tutorial/tutorial_main.md diff --git a/docs/conf.py b/docs/conf.py index b8e839a..bb2e303 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,28 +18,37 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = [ - "myst_parser", + # # "sphinxcontrib.jquery", "sphinx.ext.duration", "sphinx.ext.autosectionlabel", "sphinx.ext.autodoc", - 'sphinx_search.extension' + 'sphinx_search.extension', + # "myst_nb", + "myst_parser", + 'nbsphinx', + 'nbsphinx_link', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + "myst_parser", + ] source_suffix = { - '.md': 'markdown' + '.md': 'markdown', } templates_path = ['_templates'] - +nb_execution_mode = "off" +nbsphinx_execute = 'never' # Prefix document path to section labels, to use: # `path/to/file:heading` instead of just `heading` autosectionlabel_prefix_document = True -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'build', "api"] @@ -55,3 +64,40 @@ + + +# Include the Plotly JavaScript in the HTML output +nbsphinx_requirejs_path = "" + +# Ensures that the `require.js` is loaded for Plotly to function 
correctly +nbsphinx_requirejs_options = { + 'paths': { + 'plotly': 'https://cdn.plot.ly/plotly-latest.min' + }, + 'shim': { + 'plotly': { + 'exports': 'Plotly' + } + } +} + +# Specify the default language for syntax highlighting in Sphinx +highlight_language = 'python' + +# -- Options for HTML output ------------------------------------------------- + + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add plotly renderer options +nbsphinx_prolog = r""" +.. raw:: html + + +""" + + + diff --git a/docs/index.md b/docs/index.md index 45dd4a6..e7ec04b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -31,14 +31,18 @@ Binette is inspired from the metaWRAP bin-refinement tool but it effectively sol - Enhanced Speed: Binette significantly improves the speed of the refinement process. It achieves this by launching the initial steps of CheckM2, such as Prodigal and Diamond runs, only once on all contigs. These intermediate results are then utilized to assess the quality of any given bin, eliminating redundant computations and accelerating the refinement process. - No Limit on Input Bin Sets: Unlike its predecessor, Binette is not constrained by the number of input bin sets. It can handle and process multiple bin sets simultaneously. 
+ + + ```{toctree} :caption: 'Documentation' :maxdepth: 2 installation usage +tutorial/tutorial_main contributing -tests.md +tests api/api_ref ``` diff --git a/docs/tutorial/analyse_binette_result.ipynb b/docs/tutorial/analyse_binette_result.ipynb new file mode 100644 index 0000000..30247a8 --- /dev/null +++ b/docs/tutorial/analyse_binette_result.ipynb @@ -0,0 +1,1688 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "52e7f39c", + "metadata": {}, + "source": [ + "## Analyse Binette results" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e6a1e1ee-681d-4823-b974-7027bafd2ba9", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pathlib import Path\n", + "import plotly.express as px\n", + "import plotly.io as pio\n", + "pio.renderers.default = \"sphinx_gallery\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "34e80119-f59b-41b0-b0e5-de2d6ed0c6a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bin_idoriginnamecompletenesscontaminationscoresizeN50contig_counttoolindex
017075diff44 - 10100.000.0599.9046726658208493binette0
139427diff36 - 699.900.2099.5027966054115198binette1
247060union58 | 3398.590.8396.93460133641016165binette2
347177union91 | 25 | 5596.100.3495.42259871811891312binette3
421248diff65 - 8 - 2891.981.7188.5617680959976250binette4
544137diff76 - 13 - 2892.632.4187.8137262545669850binette5
631703diff31 - 7 - 6181.730.8480.0516652338518248binette6
713475diff47 - 3772.892.3968.1112418295061252binette7
847926union75 | 3074.314.2665.79329394929541262binette8
946775union42 | 10262.942.7557.4412935713783419binette9
1033569diff83 - 7 - 38 - 3159.182.2454.7020425274437514binette10
1139350diff57 - 16 - 7552.161.3149.5426012825332509binette11
1239558diff78 - 6 - 4364.638.0348.57185821014301293binette12
1351082union120 | 152.335.0642.216888791446472binette13
1419689diff118 - 18 - 61 - 3148.228.2331.76178267614021265binette14
\n", + "
" + ], + "text/plain": [ + " bin_id origin name completeness contamination score \\\n", + "0 17075 diff 44 - 10 100.00 0.05 99.90 \n", + "1 39427 diff 36 - 6 99.90 0.20 99.50 \n", + "2 47060 union 58 | 33 98.59 0.83 96.93 \n", + "3 47177 union 91 | 25 | 55 96.10 0.34 95.42 \n", + "4 21248 diff 65 - 8 - 28 91.98 1.71 88.56 \n", + "5 44137 diff 76 - 13 - 28 92.63 2.41 87.81 \n", + "6 31703 diff 31 - 7 - 61 81.73 0.84 80.05 \n", + "7 13475 diff 47 - 37 72.89 2.39 68.11 \n", + "8 47926 union 75 | 30 74.31 4.26 65.79 \n", + "9 46775 union 42 | 102 62.94 2.75 57.44 \n", + "10 33569 diff 83 - 7 - 38 - 31 59.18 2.24 54.70 \n", + "11 39350 diff 57 - 16 - 75 52.16 1.31 49.54 \n", + "12 39558 diff 78 - 6 - 43 64.63 8.03 48.57 \n", + "13 51082 union 120 | 1 52.33 5.06 42.21 \n", + "14 19689 diff 118 - 18 - 61 - 31 48.22 8.23 31.76 \n", + "\n", + " size N50 contig_count tool index \n", + "0 4672665 82084 93 binette 0 \n", + "1 2796605 41151 98 binette 1 \n", + "2 4601336 41016 165 binette 2 \n", + "3 2598718 11891 312 binette 3 \n", + "4 1768095 9976 250 binette 4 \n", + "5 3726254 5669 850 binette 5 \n", + "6 1665233 8518 248 binette 6 \n", + "7 1241829 5061 252 binette 7 \n", + "8 3293949 2954 1262 binette 8 \n", + "9 1293571 3783 419 binette 9 \n", + "10 2042527 4437 514 binette 10 \n", + "11 2601282 5332 509 binette 11 \n", + "12 1858210 1430 1293 binette 12 \n", + "13 688879 1446 472 binette 13 \n", + "14 1782676 1402 1265 binette 14 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "binette_result_file = \"./binette_results/final_bins_quality_reports.tsv\"\n", + "df_binette = pd.read_csv(binette_result_file, sep='\\t')\n", + "df_binette['tool'] = \"binette\"\n", + "df_binette['index'] = df_binette.index\n", + "df_binette" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "189038d3-77a0-435a-9590-4d8b3038341e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
completenesscontaminationtool
0100.000.05binette
199.900.20binette
298.590.83binette
396.100.34binette
491.981.71binette
............
208.280.01semibin2
218.120.02semibin2
227.740.01semibin2
236.180.00semibin2
244.410.13semibin2
\n", + "

140 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " completeness contamination tool\n", + "0 100.00 0.05 binette\n", + "1 99.90 0.20 binette\n", + "2 98.59 0.83 binette\n", + "3 96.10 0.34 binette\n", + "4 91.98 1.71 binette\n", + ".. ... ... ...\n", + "20 8.28 0.01 semibin2\n", + "21 8.12 0.02 semibin2\n", + "22 7.74 0.01 semibin2\n", + "23 6.18 0.00 semibin2\n", + "24 4.41 0.13 semibin2\n", + "\n", + "[140 rows x 3 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_bins_quality_reports_dir = Path(\"binette_results/input_bins_quality_reports/\")\n", + "\n", + "df_input_bin_list = [df_binette]\n", + "for input_bin_metric_file in input_bins_quality_reports_dir.glob(\"*tsv\"):\n", + " tool = input_bin_metric_file.name.split('.')[1].split('_')[0]\n", + " df_input = pd.read_csv(input_bin_metric_file, sep='\\t')\n", + " df_input['index'] = df_input.index\n", + " df_input['tool'] = tool\n", + " df_input_bin_list.append(df_input)\n", + "\n", + "df_bins = pd.concat(df_input_bin_list)\n", + " \n", + "set(df_bins['tool'])\n", + "df_bins[\"High quality bin\"] = (df_bins['completeness'] >= 90) & (df_bins['contamination'] <= 5)\n", + "#df_binette = pd.read_csv(binette_result_file, sep='\\t')\n", + "#df_binette\n", + "df_bins[[\"completeness\", \"contamination\", \"tool\"]]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "911d598f-a6c7-4178-aff2-6059235e7fc4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig = px.scatter(df_bins, x=\"completeness\",y=\"contamination\", color=\"High quality bin\", size=\"size\", facet_row=\"tool\")\n", + "fig.update_layout(\n", + " width=800,\n", + " height=800)\n", + " \n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "35c46beb-1ac9-4014-9672-91edcc1bf439", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_bins['completeness - 2*contamination'] = df_bins['completeness'] - 2*df_bins['contamination']\n", + "fig = px.line(df_bins, x=\"index\",y='completeness - 2*contamination', color=\"tool\",markers=True)\n", + "fig.update_layout(\n", + " width=800,\n", + " height=500)\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "af74bfb2-457c-4cf4-9c13-3ee9642be7ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bin_idoriginnamecompletenesscontaminationscoresizeN50contig_counttoolindexHigh quality bincompleteness - 2*contaminationContamination ≤ 10 and<br>Completeness
017075diff44 - 10100.000.0599.9046726658208493binette0True99.90> 90%
139427diff36 - 699.900.2099.5027966054115198binette1True99.50> 90%
247060union58 | 3398.590.8396.93460133641016165binette2True96.93> 90%
347177union91 | 25 | 5596.100.3495.42259871811891312binette3True95.42> 90%
421248diff65 - 8 - 2891.981.7188.5617680959976250binette4True88.56> 90%
544137diff76 - 13 - 2892.632.4187.8137262545669850binette5True87.81> 90%
631703diff31 - 7 - 6181.730.8480.0516652338518248binette6False80.05> 70% and ≤ 90%
713475diff47 - 3772.892.3968.1112418295061252binette7False68.11> 70% and ≤ 90%
847926union75 | 3074.314.2665.79329394929541262binette8False65.79> 70% and ≤ 90%
946775union42 | 10262.942.7557.4412935713783419binette9False57.44> 50% and ≤ 70%
1033569diff83 - 7 - 38 - 3159.182.2454.7020425274437514binette10False54.70> 50% and ≤ 70%
1139350diff57 - 16 - 7552.161.3149.5426012825332509binette11False49.54> 50% and ≤ 70%
1239558diff78 - 6 - 4364.638.0348.57185821014301293binette12False48.57> 50% and ≤ 70%
1351082union120 | 152.335.0642.216888791446472binette13False42.21> 50% and ≤ 70%
0125concoct/bins9.fa100.000.3899.24303358637523131concoct0True99.24> 90%
167concoct/bins41.fa100.000.4699.08476546682084101concoct1True99.08> 90%
291concoct/bins7.fa92.760.3492.08227495112187265concoct2True92.08> 90%
376concoct/bins6.fa92.633.4285.7937519505674855concoct3True85.79> 90%
465concoct/bins62.fa87.351.8083.75191785910911259concoct4False83.75> 70% and ≤ 90%
675concoct/bins48.fa73.354.2664.83328537429501261concoct6False64.83> 70% and ≤ 90%
022maxbin2maxbin2.001.fasta99.814.8190.19461681889436133maxbin20True90.19> 90%
114maxbin2maxbin2.002.fasta93.923.5386.86287437337523195maxbin21True86.86> 90%
35maxbin2maxbin2.009.fasta62.698.1446.4124384926141604maxbin23False46.41> 50% and ≤ 70%
036metabat2metabat2.14.fa99.900.2499.4227995724115199metabat20True99.42> 90%
125metabat2metabat2.8.fa93.170.2292.73214809712225226metabat21True92.73> 90%
233metabat2metabat2.12.fa93.520.9291.68426613439217157metabat22True91.68> 90%
327metabat2metabat2.11.fa84.401.5381.34190276111352218metabat23False81.34> 70% and ≤ 90%
437metabat2metabat2.1.fa84.992.7379.5329805266876502metabat24False79.53> 70% and ≤ 90%
531metabat2metabat2.2.fa83.213.1676.8918070287852274metabat25False76.89> 70% and ≤ 90%
635metabat2metabat2.4.fa76.530.1176.3134776368208471metabat26False76.31> 70% and ≤ 90%
729metabat2metabat2.7.fa71.785.7760.2413846534937292metabat27False60.24> 70% and ≤ 90%
824metabat2metabat2.3.fa51.752.9945.7717070784929362metabat28False45.77> 50% and ≤ 70%
044semibin2/output_binsSemiBin_27.fa.gz100.000.0999.8246813698208494semibin20True99.82> 90%
153semibin2/output_binsSemiBin_33.fa.gz99.920.2899.36293767837523113semibin21True99.36> 90%
250semibin2/output_binsSemiBin_10.fa.gz93.430.1493.15212929512519216semibin22True93.15> 90%
362semibin2/output_binsSemiBin_24.fa.gz92.130.0392.07416291140395139semibin23True92.07> 90%
438semibin2/output_binsSemiBin_26.fa.gz83.092.2578.5916741568389245semibin24False78.59> 70% and ≤ 90%
549semibin2/output_binsSemiBin_32.fa.gz81.871.6678.55182007311737205semibin25False78.55> 70% and ≤ 90%
660semibin2/output_binsSemiBin_22.fa.gz80.251.6376.9927909487117450semibin26False76.99> 70% and ≤ 90%
747semibin2/output_binsSemiBin_11.fa.gz72.572.4567.6712450315061253semibin27False67.67> 70% and ≤ 90%
861semibin2/output_binsSemiBin_3.fa.gz53.341.3350.6817286904913367semibin28False50.68> 50% and ≤ 70%
957semibin2/output_binsSemiBin_12.fa.gz51.921.3149.3026094515292511semibin29False49.30> 50% and ≤ 70%
\n", + "
" + ], + "text/plain": [ + " bin_id origin name completeness \\\n", + "0 17075 diff 44 - 10 100.00 \n", + "1 39427 diff 36 - 6 99.90 \n", + "2 47060 union 58 | 33 98.59 \n", + "3 47177 union 91 | 25 | 55 96.10 \n", + "4 21248 diff 65 - 8 - 28 91.98 \n", + "5 44137 diff 76 - 13 - 28 92.63 \n", + "6 31703 diff 31 - 7 - 61 81.73 \n", + "7 13475 diff 47 - 37 72.89 \n", + "8 47926 union 75 | 30 74.31 \n", + "9 46775 union 42 | 102 62.94 \n", + "10 33569 diff 83 - 7 - 38 - 31 59.18 \n", + "11 39350 diff 57 - 16 - 75 52.16 \n", + "12 39558 diff 78 - 6 - 43 64.63 \n", + "13 51082 union 120 | 1 52.33 \n", + "0 125 concoct/bins 9.fa 100.00 \n", + "1 67 concoct/bins 41.fa 100.00 \n", + "2 91 concoct/bins 7.fa 92.76 \n", + "3 76 concoct/bins 6.fa 92.63 \n", + "4 65 concoct/bins 62.fa 87.35 \n", + "6 75 concoct/bins 48.fa 73.35 \n", + "0 22 maxbin2 maxbin2.001.fasta 99.81 \n", + "1 14 maxbin2 maxbin2.002.fasta 93.92 \n", + "3 5 maxbin2 maxbin2.009.fasta 62.69 \n", + "0 36 metabat2 metabat2.14.fa 99.90 \n", + "1 25 metabat2 metabat2.8.fa 93.17 \n", + "2 33 metabat2 metabat2.12.fa 93.52 \n", + "3 27 metabat2 metabat2.11.fa 84.40 \n", + "4 37 metabat2 metabat2.1.fa 84.99 \n", + "5 31 metabat2 metabat2.2.fa 83.21 \n", + "6 35 metabat2 metabat2.4.fa 76.53 \n", + "7 29 metabat2 metabat2.7.fa 71.78 \n", + "8 24 metabat2 metabat2.3.fa 51.75 \n", + "0 44 semibin2/output_bins SemiBin_27.fa.gz 100.00 \n", + "1 53 semibin2/output_bins SemiBin_33.fa.gz 99.92 \n", + "2 50 semibin2/output_bins SemiBin_10.fa.gz 93.43 \n", + "3 62 semibin2/output_bins SemiBin_24.fa.gz 92.13 \n", + "4 38 semibin2/output_bins SemiBin_26.fa.gz 83.09 \n", + "5 49 semibin2/output_bins SemiBin_32.fa.gz 81.87 \n", + "6 60 semibin2/output_bins SemiBin_22.fa.gz 80.25 \n", + "7 47 semibin2/output_bins SemiBin_11.fa.gz 72.57 \n", + "8 61 semibin2/output_bins SemiBin_3.fa.gz 53.34 \n", + "9 57 semibin2/output_bins SemiBin_12.fa.gz 51.92 \n", + "\n", + " contamination score size N50 contig_count tool index \\\n", + "0 0.05 
99.90 4672665 82084 93 binette 0 \n", + "1 0.20 99.50 2796605 41151 98 binette 1 \n", + "2 0.83 96.93 4601336 41016 165 binette 2 \n", + "3 0.34 95.42 2598718 11891 312 binette 3 \n", + "4 1.71 88.56 1768095 9976 250 binette 4 \n", + "5 2.41 87.81 3726254 5669 850 binette 5 \n", + "6 0.84 80.05 1665233 8518 248 binette 6 \n", + "7 2.39 68.11 1241829 5061 252 binette 7 \n", + "8 4.26 65.79 3293949 2954 1262 binette 8 \n", + "9 2.75 57.44 1293571 3783 419 binette 9 \n", + "10 2.24 54.70 2042527 4437 514 binette 10 \n", + "11 1.31 49.54 2601282 5332 509 binette 11 \n", + "12 8.03 48.57 1858210 1430 1293 binette 12 \n", + "13 5.06 42.21 688879 1446 472 binette 13 \n", + "0 0.38 99.24 3033586 37523 131 concoct 0 \n", + "1 0.46 99.08 4765466 82084 101 concoct 1 \n", + "2 0.34 92.08 2274951 12187 265 concoct 2 \n", + "3 3.42 85.79 3751950 5674 855 concoct 3 \n", + "4 1.80 83.75 1917859 10911 259 concoct 4 \n", + "6 4.26 64.83 3285374 2950 1261 concoct 6 \n", + "0 4.81 90.19 4616818 89436 133 maxbin2 0 \n", + "1 3.53 86.86 2874373 37523 195 maxbin2 1 \n", + "3 8.14 46.41 2438492 6141 604 maxbin2 3 \n", + "0 0.24 99.42 2799572 41151 99 metabat2 0 \n", + "1 0.22 92.73 2148097 12225 226 metabat2 1 \n", + "2 0.92 91.68 4266134 39217 157 metabat2 2 \n", + "3 1.53 81.34 1902761 11352 218 metabat2 3 \n", + "4 2.73 79.53 2980526 6876 502 metabat2 4 \n", + "5 3.16 76.89 1807028 7852 274 metabat2 5 \n", + "6 0.11 76.31 3477636 82084 71 metabat2 6 \n", + "7 5.77 60.24 1384653 4937 292 metabat2 7 \n", + "8 2.99 45.77 1707078 4929 362 metabat2 8 \n", + "0 0.09 99.82 4681369 82084 94 semibin2 0 \n", + "1 0.28 99.36 2937678 37523 113 semibin2 1 \n", + "2 0.14 93.15 2129295 12519 216 semibin2 2 \n", + "3 0.03 92.07 4162911 40395 139 semibin2 3 \n", + "4 2.25 78.59 1674156 8389 245 semibin2 4 \n", + "5 1.66 78.55 1820073 11737 205 semibin2 5 \n", + "6 1.63 76.99 2790948 7117 450 semibin2 6 \n", + "7 2.45 67.67 1245031 5061 253 semibin2 7 \n", + "8 1.33 50.68 1728690 4913 367 semibin2 8 
\n", + "9 1.31 49.30 2609451 5292 511 semibin2 9 \n", + "\n", + " High quality bin completeness - 2*contamination \\\n", + "0 True 99.90 \n", + "1 True 99.50 \n", + "2 True 96.93 \n", + "3 True 95.42 \n", + "4 True 88.56 \n", + "5 True 87.81 \n", + "6 False 80.05 \n", + "7 False 68.11 \n", + "8 False 65.79 \n", + "9 False 57.44 \n", + "10 False 54.70 \n", + "11 False 49.54 \n", + "12 False 48.57 \n", + "13 False 42.21 \n", + "0 True 99.24 \n", + "1 True 99.08 \n", + "2 True 92.08 \n", + "3 True 85.79 \n", + "4 False 83.75 \n", + "6 False 64.83 \n", + "0 True 90.19 \n", + "1 True 86.86 \n", + "3 False 46.41 \n", + "0 True 99.42 \n", + "1 True 92.73 \n", + "2 True 91.68 \n", + "3 False 81.34 \n", + "4 False 79.53 \n", + "5 False 76.89 \n", + "6 False 76.31 \n", + "7 False 60.24 \n", + "8 False 45.77 \n", + "0 True 99.82 \n", + "1 True 99.36 \n", + "2 True 93.15 \n", + "3 True 92.07 \n", + "4 False 78.59 \n", + "5 False 78.55 \n", + "6 False 76.99 \n", + "7 False 67.67 \n", + "8 False 50.68 \n", + "9 False 49.30 \n", + "\n", + " Contamination ≤ 10 and
Completeness \n", + "0 > 90% \n", + "1 > 90% \n", + "2 > 90% \n", + "3 > 90% \n", + "4 > 90% \n", + "5 > 90% \n", + "6 > 70% and ≤ 90% \n", + "7 > 70% and ≤ 90% \n", + "8 > 70% and ≤ 90% \n", + "9 > 50% and ≤ 70% \n", + "10 > 50% and ≤ 70% \n", + "11 > 50% and ≤ 70% \n", + "12 > 50% and ≤ 70% \n", + "13 > 50% and ≤ 70% \n", + "0 > 90% \n", + "1 > 90% \n", + "2 > 90% \n", + "3 > 90% \n", + "4 > 70% and ≤ 90% \n", + "6 > 70% and ≤ 90% \n", + "0 > 90% \n", + "1 > 90% \n", + "3 > 50% and ≤ 70% \n", + "0 > 90% \n", + "1 > 90% \n", + "2 > 90% \n", + "3 > 70% and ≤ 90% \n", + "4 > 70% and ≤ 90% \n", + "5 > 70% and ≤ 90% \n", + "6 > 70% and ≤ 90% \n", + "7 > 70% and ≤ 90% \n", + "8 > 50% and ≤ 70% \n", + "0 > 90% \n", + "1 > 90% \n", + "2 > 90% \n", + "3 > 90% \n", + "4 > 70% and ≤ 90% \n", + "5 > 70% and ≤ 90% \n", + "6 > 70% and ≤ 90% \n", + "7 > 70% and ≤ 90% \n", + "8 > 50% and ≤ 70% \n", + "9 > 50% and ≤ 70% " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "contamination_cutoff = 10\n", + "low_contamination_filt = df_bins['contamination'] <= contamination_cutoff\n", + "high_completeness_filt = df_bins['completeness'] > 90\n", + "medium_completeness_filt = df_bins['completeness'] > 70\n", + "low_completeness_filt = df_bins['completeness'] > 50\n", + "\n", + "quality = f'Contamination ≤ {contamination_cutoff} and
Completeness'\n", + "df_bins.loc[low_contamination_filt & low_completeness_filt, quality] = '> 50% and ≤ 70%'\n", + "df_bins.loc[low_contamination_filt & medium_completeness_filt, quality] = '> 70% and ≤ 90%'\n", + "df_bins.loc[low_contamination_filt & high_completeness_filt, quality] = '> 90%'\n", + "df_bins.loc[~df_bins[quality].isna()]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fa71ff37-9846-4826-a4bb-6c4b0069cea0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Contamination ≤ 10 and<br>Completenesstoolbin_count
0> 50% and ≤ 70%binette5
1> 50% and ≤ 70%maxbin21
2> 50% and ≤ 70%metabat21
3> 50% and ≤ 70%semibin22
4> 70% and ≤ 90%binette3
5> 70% and ≤ 90%concoct2
6> 70% and ≤ 90%metabat25
7> 70% and ≤ 90%semibin24
8> 90%binette6
9> 90%concoct4
10> 90%maxbin22
11> 90%metabat23
12> 90%semibin24
\n", + "
" + ], + "text/plain": [ + " Contamination ≤ 10 and
Completeness tool bin_count\n", + "0 > 50% and ≤ 70% binette 5\n", + "1 > 50% and ≤ 70% maxbin2 1\n", + "2 > 50% and ≤ 70% metabat2 1\n", + "3 > 50% and ≤ 70% semibin2 2\n", + "4 > 70% and ≤ 90% binette 3\n", + "5 > 70% and ≤ 90% concoct 2\n", + "6 > 70% and ≤ 90% metabat2 5\n", + "7 > 70% and ≤ 90% semibin2 4\n", + "8 > 90% binette 6\n", + "9 > 90% concoct 4\n", + "10 > 90% maxbin2 2\n", + "11 > 90% metabat2 3\n", + "12 > 90% semibin2 4" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_bins.groupby([quality, 'tool']).value_counts(ascending=True).reset_index()\n", + "\n", + "df_bins_quality_grouped = df_bins.groupby([quality, 'tool']).agg(bin_count=('bin_id', 'count')).reset_index()\n", + "df_bins_quality_grouped" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "250def29-167e-4a3b-8194-282f602945c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "color_discrete_map={\"> 90%\": px.colors.qualitative.Prism[4],\n", + " \"> 70% and ≤ 90%\": px.colors.qualitative.Prism[2],\n", + " \"> 50% and ≤ 70%\": px.colors.qualitative.Prism[6]}\n", + "\n", + "fig = px.bar(df_bins_quality_grouped, x='tool', y=\"bin_count\", color=quality,\n", + " barmode='stack', color_discrete_map=color_discrete_map, text=\"bin_count\",\n", + " category_orders={\"tool\":[\"binette\", \"semibin2\", \"concoct\", \"metabat2\", \"maxbin2\"]},\n", + " opacity = 0.9)#[ \"#008c8a\", px.colors.qualitative.Safe[4], '#2596be'])\n", + "\n", + "fig.update_layout(\n", + " width=800,\n", + " height=500,\n", + " legend=dict(\n", + " traceorder=\"reversed\",\n", + " ))\n", + "fig" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/tutorial/assembly.md b/docs/tutorial/assembly.md new file mode 100644 index 0000000..cd51e76 --- /dev/null +++ b/docs/tutorial/assembly.md @@ -0,0 +1,43 @@ + + +## Assemble the reads + +We will use megahit to assemble the reads + +```bash + +cd /home/jmainguy/Analysis/Binette/Binette_tutorial/ncezid-biome_datasets/exec_tutorial_jupyter +``` + +```bash + +megahit -1 coal-metagenomics/Kickstart_1.fastq.gz -2 coal-metagenomics/Kickstart_2.fastq.gz --out-dir Kickstart.megahit --out-prefix R1 --num-cpu-threads 12 + +``` + + +This take 27m49,879s + +```{note} +We can use spade as well. It performs generally better that megahit but is generally longer and consume more memory than megahit. See cami benchmark ??? 
+``` + + + +## Align the reads over the assembly + +First we need to map the reads back against the assembly to get coverage information + +```bash + +mkdir -p alignments_bwa/ + +bwa-mem2 index Kickstart.megahit/R1.contigs.fa -p Kickstart.megahit/R1.contigs.fa + +bwa-mem2 mem -t 12 Kickstart.megahit/R1.contigs.fa coal-metagenomics/Kickstart_*.fastq.gz | samtools view -@ 12 -bS - | samtools sort -@ 12 - -o alignments_bwa/Kickstart.bam + +samtools index alignments_bwa/Kickstart.bam + +``` + +This take around 12 minutes \ No newline at end of file diff --git a/docs/tutorial/binette.md b/docs/tutorial/binette.md new file mode 100644 index 0000000..687398f --- /dev/null +++ b/docs/tutorial/binette.md @@ -0,0 +1,7 @@ +## Run Binette + +```{code-cell} bash + +binette --bin_dirs maxbin2/ metabat2/ semibin2/output_bins/ concoct/bins/ -c Kickstart.megahit/R1.contigs.fa --verbose -t 12 -o binette_results + +``` \ No newline at end of file diff --git a/docs/tutorial/binning.md b/docs/tutorial/binning.md new file mode 100644 index 0000000..36efbed --- /dev/null +++ b/docs/tutorial/binning.md @@ -0,0 +1,67 @@ + +## Run binning tools + + +### metabat2 + +We first generate a depth file from the bam file using jgi_summarize_bam_contig_depths script from metabat2. This depth file will be used also with maxbin2. 
+```bash + +jgi_summarize_bam_contig_depths --outputDepth depth_Kickstart.txt alignments_bwa/Kickstart.bam +``` + +Now we can run metabat2: + +```bash + +metabat2 --inFile Kickstart.megahit/R1.contigs.fa --abdFile depth_Kickstart.txt --outFile metabat2/metabat2 --numThreads 12 --seed 1 + +``` + + +### maxbin2 + +We use the depth file produced by `jgi_summarize_bam_contig_depths` + +```bash + +mkdir -p maxbin2 +run_MaxBin.pl -contig Kickstart.megahit/R1.contigs.fa -abund depth_Kickstart.txt -thread 12 -out maxbin2/maxbin2 + +``` + +### concoct + +Then we can also run concoct with the folowing commands: + +```bash + +mkdir -p concoct/ + +cut_up_fasta.py Kickstart.megahit/R1.contigs.fa --chunk_size 10000 --overlap_size 0 --merge_last --bedfile concoct/contigs_10K.bed > concoct/contigs_10K.fa + +concoct_coverage_table.py concoct/contigs_10K.bed alignments_bwa/Kickstart.bam > concoct/coverage_table.tsv + +concoct --composition_file concoct/contigs_10K.fa --coverage_file concoct/coverage_table.tsv --basename concoct/bins --threads 12 + +merge_cutup_clustering.py concoct/bins_clustering_gt1000.csv > concoct/clustering_merge.csv + +mkdir -p concoct/bins + +extract_fasta_bins.py Kickstart.megahit/R1.contigs.fa concoct/clustering_merge.csv --output_path concoct/bins +``` + +### SemiBin2 + +We can launch semibin2 as well with its `single_easy_bin` command. + +```{note} +This take some time so it can be skipped. +``` + +```bash + +SemiBin2 single_easy_bin -i Kickstart.megahit/R1.contigs.fa -b alignments_bwa/Kickstart.bam -o semibin2/ -p 12 + +``` + diff --git a/docs/tutorial/set_env_and_get_data.md b/docs/tutorial/set_env_and_get_data.md new file mode 100644 index 0000000..b950a44 --- /dev/null +++ b/docs/tutorial/set_env_and_get_data.md @@ -0,0 +1,78 @@ + +## Set tutorial environment + +We will download necessary tool in a dedicated conda envrionnement. 




Let's create a directory to run the tutorial:


```bash

mamba env create -f binette_tutorial_env.yaml -n binette_tuto

```



## Get the Data

### Using ncezid-biome datasets tool

I downloaded the metagenome Kickstart from the above dataset (SAMN05024035), which corresponds to the SRA run SRR5058924: https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRR5058924&o=acc_s%3Aa


We will download the data of the Kickstart (SAMN05024035) dataset from this repository: https://github.com/ncezid-biome/datasets?tab=readme-ov-file#edlb

We installed it with conda, as detailed here: https://github.com/ncezid-biome/datasets/blob/master/INSTALL.md#conda

Now we can download the Kickstart dataset with the following commands.

We first download the coal-metagenomics table from the GitHub repository (https://github.com/ncezid-biome/datasets/blob/master/datasets/coal-metagenomics.tsv)
and select only the line corresponding to the Kickstart dataset.




```bash
# download the coal-metagenomic tsv file from the github repository
wget https://raw.githubusercontent.com/ncezid-biome/datasets/master/datasets/coal-metagenomics.tsv

# select the header of the table as it is necessary for the download

head -n7 coal-metagenomics.tsv > coal-metagenomics_Kickstart_only.tsv
grep SRR5058924 coal-metagenomics.tsv >> coal-metagenomics_Kickstart_only.tsv

GenFSGopher.pl --numcpus 12 --compressed --outdir coal-metagenomics coal-metagenomics_Kickstart_only.tsv

```

It takes around 16 min to run.

You should have the following structure:
```
├── coal-metagenomics_Kickstart_only.tsv
└── data
    ├── in.tsv
    ├── Kickstart_1.fastq.gz
    ├── Kickstart_1.fastq.sha256
    ├── Kickstart_2.fastq.gz
    ├── Kickstart_2.fastq.sha256
    ├── Makefile
    ├── prefetch.done
    ├── sha256sum.log
    ├── SRR5058924
    │   └── SRR5058924.sra
    └── tree.dnd


```

```{tip}
You can remove the SRA file `data/SRR5058924/SRR5058924.sra`, as we do not need it anymore: we will exclusively use the fastq files.
You can do this with `rm data/SRR5058924/SRR5058924.sra`.
```

```{note}
You can also download the data using the SRA toolkit, which is what this tool does in the background, with additional checksums to ensure data integrity. After installing the SRA toolkit (with conda, for example: https://anaconda.org/bioconda/sra-tools) you can run the two following commands to retrieve the data: `prefetch SRR5058924` and `fastq-dump --defline-seq '@$ac_$sn/$ri' --defline-qual '+' --split-3 -O . SRR5058924.sra`
``` \ No newline at end of file
diff --git a/docs/tutorial/tutorial_main.md b/docs/tutorial/tutorial_main.md
new file mode 100644
index 0000000..1d35a5a
--- /dev/null
+++ b/docs/tutorial/tutorial_main.md
@@ -0,0 +1,52 @@
+

# Tutorial

The goal of this tutorial is to show an example of the commands needed to use Binette on real data. We will start our journey from metagenomics reads that we will download; we will then assemble these reads into contigs, which we will bin with different binning tools. Finally, we will use Binette to refine those bins. 
+ + + +```{toctree} +:caption: 'Tutorial' +:maxdepth: 2 + +set_env_and_get_data +assembly +binning +binette +analyse_binette_result.ipynb +``` + + + + + + diff --git a/pyproject.toml b/pyproject.toml index 8ff3ff8..06553ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,8 +44,11 @@ doc = [ "readthedocs-sphinx-search==0.3.1", "sphinx-autobuild==2021.3.14", "myst-parser==1.0.0", - "docutils==0.18.1" -] + "docutils==0.18.1", + "myst-nb", + "nbsphinx" + ] + dev = [ "pytest>=7.0.0", "pytest-cov" From a1f983c2fd8b60c6e8be6230f5b4f451ef910d7b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 2024 15:00:04 +0200 Subject: [PATCH 02/20] adjust nbsphinx version in doc deps --- pyproject.toml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 06553ae..9bbee26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,11 +44,10 @@ doc = [ "readthedocs-sphinx-search==0.3.1", "sphinx-autobuild==2021.3.14", "myst-parser==1.0.0", - "docutils==0.18.1", - "myst-nb", - "nbsphinx" + "docutils==0.18.1", #"myst-nb", + "nbsphinx==0.9.5" ] - + dev = [ "pytest>=7.0.0", "pytest-cov" From 41ef85e7396a93b98543378c920ef275eea6d155 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 2024 15:05:56 +0200 Subject: [PATCH 03/20] manage sphinx deps --- docs/conf.py | 4 ++-- pyproject.toml | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index bb2e303..eae8582 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -28,8 +28,8 @@ "myst_parser", 'nbsphinx', 'nbsphinx_link', - 'sphinx.ext.napoleon', - 'sphinx.ext.viewcode', + # 'sphinx.ext.napoleon', + # 'sphinx.ext.viewcode', "myst_parser", ] diff --git a/pyproject.toml b/pyproject.toml index 9bbee26..51802de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,8 @@ doc = [ "sphinx-autobuild==2021.3.14", "myst-parser==1.0.0", "docutils==0.18.1", #"myst-nb", - "nbsphinx==0.9.5" + "nbsphinx==0.9.5", + nbsphinx_link==1.3.0 ] 
dev = [ From 6e670cf61b454cf1ea7c6b57a525146cbb864dab Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 2024 15:07:08 +0200 Subject: [PATCH 04/20] add missing quote --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 51802de..6687291 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ doc = [ "myst-parser==1.0.0", "docutils==0.18.1", #"myst-nb", "nbsphinx==0.9.5", - nbsphinx_link==1.3.0 + "nbsphinx_link==1.3.0" ] dev = [ From 863352070f51d7eca1d50e96706d6b405192bde8 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 2024 15:25:33 +0200 Subject: [PATCH 05/20] test with sphinx-book-theme --- docs/conf.py | 3 ++- pyproject.toml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index eae8582..ff33a85 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -55,7 +55,8 @@ # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = 'sphinx_rtd_theme' #'alabaster' # +# html_theme = 'sphinx_rtd_theme' #'alabaster' # +html_theme = 'sphinx_book_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, diff --git a/pyproject.toml b/pyproject.toml index 6687291..6fad059 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,8 @@ doc = [ "myst-parser==1.0.0", "docutils==0.18.1", #"myst-nb", "nbsphinx==0.9.5", - "nbsphinx_link==1.3.0" + "nbsphinx_link==1.3.0", + "sphinx-book-theme==1.0.1" ] dev = [ From ba8c70b67de9b5b7ccd1fc85010d43a33de174b3 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 2024 17:59:14 +0200 Subject: [PATCH 06/20] improve tutorial --- docs/conf.py | 17 +++++- docs/tutorial/assembly.md | 56 ++++++++++--------- docs/tutorial/binette.md | 56 +++++++++++++++++-- docs/tutorial/binning.md | 70 ++++++++++++++---------- docs/tutorial/set_env_and_get_data.md | 78 --------------------------- docs/tutorial/set_environment.md | 27 ++++++++++ docs/tutorial/tutorial_main.md | 73 +++++++++++++++++++++++-- 7 files changed, 236 insertions(+), 141 deletions(-) delete mode 100644 docs/tutorial/set_env_and_get_data.md create mode 100644 docs/tutorial/set_environment.md diff --git a/docs/conf.py b/docs/conf.py index ff33a85..37654c6 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -31,9 +31,22 @@ # 'sphinx.ext.napoleon', # 'sphinx.ext.viewcode', "myst_parser", - + 'sphinxcontrib.mermaid' +] +myst_enable_extensions = [ + "amsmath", + "colon_fence", + "deflist", + "dollarmath", + "fieldlist", + "html_admonition", + "html_image", + "replacements", + "smartquotes", + "strikethrough", + "substitution", + "tasklist", ] - source_suffix = { '.md': 'markdown', diff --git a/docs/tutorial/assembly.md b/docs/tutorial/assembly.md index cd51e76..c7104ed 100644 --- a/docs/tutorial/assembly.md +++ b/docs/tutorial/assembly.md @@ -1,43 +1,49 @@ +## Assemble the Reads +We will use **MEGAHIT** to assemble the reads from our dataset. 
Run the following command: -## Assemble the reads - -We will use megahit to assemble the reads - -```bash - -cd /home/jmainguy/Analysis/Binette/Binette_tutorial/ncezid-biome_datasets/exec_tutorial_jupyter +```{code-block} bash +megahit -1 coal-metagenomics/Kickstart_1.fastq.gz \ + -2 coal-metagenomics/Kickstart_2.fastq.gz \ + --out-dir Kickstart.megahit --out-prefix R1 --num-cpu-threads 12 ``` -```bash +:::{admonition} ⌛ Expected Time +:class: note -megahit -1 coal-metagenomics/Kickstart_1.fastq.gz -2 coal-metagenomics/Kickstart_2.fastq.gz --out-dir Kickstart.megahit --out-prefix R1 --num-cpu-threads 12 - -``` +This process takes approximately 28 minutes to complete. +::: -This take 27m49,879s +```{admonition} Note +:class: note -```{note} -We can use spade as well. It performs generally better that megahit but is generally longer and consume more memory than megahit. See cami benchmark ??? +You can also use **SPAdes** for assembly. It generally performs better than MEGAHIT but takes longer and requires more memory. Refer to the CAMI benchmark for a detailed comparison. ``` +## Align the Reads Over the Assembly +To get coverage information, we first need to map the reads back to the assembly. 
-## Align the reads over the assembly - -First we need to map the reads back against the assembly to get coverage information - -```bash - +```{code-block} bash +# Create a directory for the alignments mkdir -p alignments_bwa/ +# Index the contigs file using BWA-MEM2 bwa-mem2 index Kickstart.megahit/R1.contigs.fa -p Kickstart.megahit/R1.contigs.fa -bwa-mem2 mem -t 12 Kickstart.megahit/R1.contigs.fa coal-metagenomics/Kickstart_*.fastq.gz | samtools view -@ 12 -bS - | samtools sort -@ 12 - -o alignments_bwa/Kickstart.bam - -samtools index alignments_bwa/Kickstart.bam +# Map reads back to the assembly, convert to BAM format, and sort +bwa-mem2 mem -t 12 Kickstart.megahit/R1.contigs.fa coal-metagenomics/Kickstart_*.fastq.gz | \ +samtools view -@ 12 -bS - | \ +samtools sort -@ 12 - -o alignments_bwa/Kickstart.bam +# Index the BAM file +samtools index alignments_bwa/Kickstart.bam ``` - -This take around 12 minutes \ No newline at end of file + + +:::{admonition} ⌛ Expected Time +:class: note + +This process takes approximately 12 minutes to complete. +::: diff --git a/docs/tutorial/binette.md b/docs/tutorial/binette.md index 687398f..926ce3b 100644 --- a/docs/tutorial/binette.md +++ b/docs/tutorial/binette.md @@ -1,7 +1,55 @@ -## Run Binette -```{code-cell} bash +## Run Binette -binette --bin_dirs maxbin2/ metabat2/ semibin2/output_bins/ concoct/bins/ -c Kickstart.megahit/R1.contigs.fa --verbose -t 12 -o binette_results +Binette will use the previously computed bins to refine and improve them, generating a new set of higher-quality bins. 
+ +To run Binette, use the following command: + +```bash +binette --bin_dirs maxbin2/ metabat2/ semibin2/output_bins/ concoct/bins/ \ + -c Kickstart.megahit/R1.contigs.fa \ + --verbose -t 12 -o binette_results +``` + +Once Binette completes, the `binette_results` directory should have the following structure: + +```plaintext +binette_results/ +├── final_bins +│ ├── bin_13475.fa +│ ├── bin_17075.fa +│ ├── bin_19689.fa +│ ├── bin_21248.fa +│ ├── bin_31703.fa +│ ├── bin_33569.fa +│ ├── bin_39350.fa +│ ├── bin_39427.fa +│ ├── bin_39558.fa +│ ├── bin_44137.fa +│ ├── bin_46775.fa +│ ├── bin_47060.fa +│ ├── bin_47177.fa +│ ├── bin_47926.fa +│ └── bin_51082.fa +├── final_bins_quality_reports.tsv +├── input_bins_quality_reports +│ ├── input_bins_1.concoct_bins.tsv +│ ├── input_bins_2.maxbin2.tsv +│ ├── input_bins_3.metabat2.tsv +│ └── input_bins_4.semibin2_output_bins.tsv +└── temporary_files + ├── assembly_proteins.faa + ├── diamond_result.log + └── diamond_result.tsv +``` + +### Key Output Files: + +- **`final_bins/`**: Contains the refined bins in FASTA format. +- **`final_bins_quality_reports.tsv`**: A summary report containing CheckM2 metrics for the final bin selection. +- **`input_bins_quality_reports/`**: Quality reports for each of the input bin sets from MaxBin2, MetaBAT2, CONCOCT, and SemiBin2. + +### Next Steps + +In the next section, we will use `final_bins_quality_reports.tsv` along with the reports from `binette_results/input_bins_quality_reports` to visualize Binette's bins and compare them with the initial bin sets. -``` \ No newline at end of file diff --git a/docs/tutorial/binning.md b/docs/tutorial/binning.md index 36efbed..8624cf6 100644 --- a/docs/tutorial/binning.md +++ b/docs/tutorial/binning.md @@ -1,49 +1,62 @@ +## Run Binning Tools -## Run binning tools - +In this section, we'll use different binning tools to group contigs of assembly. 
-### metabat2 +### MetaBAT2 -We first generate a depth file from the bam file using jgi_summarize_bam_contig_depths script from metabat2. This depth file will be used also with maxbin2. -```bash +First, generate a depth file from the BAM file using the `jgi_summarize_bam_contig_depths` script from MetaBAT2. This depth file will also be used for MaxBin2. -jgi_summarize_bam_contig_depths --outputDepth depth_Kickstart.txt alignments_bwa/Kickstart.bam +```bash +jgi_summarize_bam_contig_depths --outputDepth depth_Kickstart.txt alignments_bwa/Kickstart.bam ``` -Now we can run metabat2: +Now, run MetaBAT2 with the generated depth file: ```bash +metabat2 --inFile Kickstart.megahit/R1.contigs.fa --abdFile depth_Kickstart.txt --outFile metabat2/metabat2 --numThreads 12 --seed 1 +``` -metabat2 --inFile Kickstart.megahit/R1.contigs.fa --abdFile depth_Kickstart.txt --outFile metabat2/metabat2 --numThreads 12 --seed 1 +### MaxBin2 +We will use the same depth file produced by `jgi_summarize_bam_contig_depths` for MetaBAT2: + +```bash +mkdir -p maxbin2 +run_MaxBin.pl -contig Kickstart.megahit/R1.contigs.fa \ + -abund depth_Kickstart.txt -thread 12 -out maxbin2/maxbin2 ``` +### CONCOCT -### maxbin2 +To run CONCOCT, follow these steps: -We use the depth file produced by `jgi_summarize_bam_contig_depths` +1. **Cut up the FASTA file** into chunks for processing: ```bash +mkdir -p concoct/ -mkdir -p maxbin2 -run_MaxBin.pl -contig Kickstart.megahit/R1.contigs.fa -abund depth_Kickstart.txt -thread 12 -out maxbin2/maxbin2 - +cut_up_fasta.py Kickstart.megahit/R1.contigs.fa --chunk_size 10000 \ + --overlap_size 0 --merge_last \ + --bedfile concoct/contigs_10K.bed > concoct/contigs_10K.fa ``` -### concoct - -Then we can also run concoct with the folowing commands: +2. 
**Generate the coverage table** from the BAM file: ```bash +concoct_coverage_table.py concoct/contigs_10K.bed alignments_bwa/Kickstart.bam > concoct/coverage_table.tsv +``` -mkdir -p concoct/ - -cut_up_fasta.py Kickstart.megahit/R1.contigs.fa --chunk_size 10000 --overlap_size 0 --merge_last --bedfile concoct/contigs_10K.bed > concoct/contigs_10K.fa +3. **Run CONCOCT** with the composition and coverage files: -concoct_coverage_table.py concoct/contigs_10K.bed alignments_bwa/Kickstart.bam > concoct/coverage_table.tsv +```bash +concoct --composition_file concoct/contigs_10K.fa \ + --coverage_file concoct/coverage_table.tsv \ + --basename concoct/bins --threads 12 +``` -concoct --composition_file concoct/contigs_10K.fa --coverage_file concoct/coverage_table.tsv --basename concoct/bins --threads 12 +4. **Merge the clustering results** and extract bins: +```bash merge_cutup_clustering.py concoct/bins_clustering_gt1000.csv > concoct/clustering_merge.csv mkdir -p concoct/bins @@ -53,15 +66,16 @@ extract_fasta_bins.py Kickstart.megahit/R1.contigs.fa concoct/clustering_merge.c ### SemiBin2 -We can launch semibin2 as well with its `single_easy_bin` command. +You can also run SemiBin2 with its `single_easy_bin` command: + +```{admonition} ⏳ Time Note +:class: note -```{note} -This take some time so it can be skipped. +This process can take some time, so it may be skipped. ``` ```bash - -SemiBin2 single_easy_bin -i Kickstart.megahit/R1.contigs.fa -b alignments_bwa/Kickstart.bam -o semibin2/ -p 12 - +SemiBin2 single_easy_bin -i Kickstart.megahit/R1.contigs.fa \ + -b alignments_bwa/Kickstart.bam \ + -o semibin2/ -p 12 ``` - diff --git a/docs/tutorial/set_env_and_get_data.md b/docs/tutorial/set_env_and_get_data.md deleted file mode 100644 index b950a44..0000000 --- a/docs/tutorial/set_env_and_get_data.md +++ /dev/null @@ -1,78 +0,0 @@ - -## Set tutorial environment - -We will download necessary tool in a dedicated conda envrionnement. 
- - - -Let's create a directory to run the tutorial: - - -```bash - -mamba env create -f binette_tutorial_env.yaml -n binette_tuto - -``` - - - -## Get the Data - -### Using ncezid-biome datasets tool - -I downloaded the metagenome Kickstart from the above dataset (SAMN05024035) that correspond to this sra SRR5058924 https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRR5058924&o=acc_s%3Aa - - -We will donwload the data of the Kickstart (SAMN05024035) dataset this repository that https://github.com/ncezid-biome/datasets?tab=readme-ov-file#edlb - -We had use conda as detailed here https://github.com/ncezid-biome/datasets/blob/master/INSTALL.md#conda - -Now we can download the Kickstart dataset with the folowing commands. - -We first download the coal-metagenomic table from the github repository : https://github.com/ncezid-biome/datasets/blob/master/datasets/coal-metagenomics.tsv -ANd just select the line corresponding to the Kickstart dataset. - - - - -```bash -# download the coal-metagenomic tsv file from the github repository -wget https://raw.githubusercontent.com/ncezid-biome/datasets/master/datasets/coal-metagenomics.tsv - -# select the header of the table as it is necessary for the download - -head -n7 coal-metagenomics.tsv > coal-metagenomics_Kickstart_only.tsv -grep SRR5058924 coal-metagenomics.tsv >> coal-metagenomics_Kickstart_only.tsv - -GenFSGopher.pl --numcpus 12 --compressed --outdir coal-metagenomics coal-metagenomics.tsv - -``` - -It takes around 16min to run - -You should hae the folowing structure -``` -├── coal-metagenomics_Kickstart_only.tsv -└── data - ├── in.tsv - ├── Kickstart_1.fastq.gz - ├── Kickstart_1.fastq.sha256 - ├── Kickstart_2.fastq.gz - ├── Kickstart_2.fastq.sha256 - ├── Makefile - ├── prefetch.done - ├── sha256sum.log - ├── SRR5058924 - │   └── SRR5058924.sra - └── tree.dnd - - -``` - -```{tip} -You can remove the SRA file `data/SRR5058924/SRR5058924.sra` as we do not need it anymore as we will exclusively use the fastq files. 
with `rm data/SRR5058924/SRR5058924.sra` -``` - -```{note} -You can also download the data using SRA toolkit which what the tool does in the background but add some check sum to ensure data integrity. After instaling sra toolkit (with conda for example : https://anaconda.org/bioconda/sra-tools) you can run the two commands folowing commands to retrived the data: `prefetch SRR5058924` and `fastq-dump --defline-seq '@$ac_$sn/$ri' --defline-qual '+' --split-3 -O . SRR5058924.sra` -``` \ No newline at end of file diff --git a/docs/tutorial/set_environment.md b/docs/tutorial/set_environment.md new file mode 100644 index 0000000..52d6092 --- /dev/null +++ b/docs/tutorial/set_environment.md @@ -0,0 +1,27 @@ +## Set Up the Tutorial Environment + +To get started, we'll download the necessary tools and set them up in a dedicated Conda environment. + +### Create a Conda Environment + +First, let's create a new Conda environment specifically for this tutorial: + +```{code-block} bash +mamba env create -f binette_tutorial_env.yaml -n binette_tuto +``` + +This command will create a Conda environment named `binette_tuto` using the environment file `binette_tutorial_env.yaml`. + +### Activate the Environment + +After the environment is created, activate it by running: + +```{code-block} bash +conda activate binette_tuto +``` + +Below is the content of the `binette_tutorial_env.yaml` file: + +```{include} binette_tutorial_env.yaml +:code: yaml +``` diff --git a/docs/tutorial/tutorial_main.md b/docs/tutorial/tutorial_main.md index 1d35a5a..d21931c 100644 --- a/docs/tutorial/tutorial_main.md +++ b/docs/tutorial/tutorial_main.md @@ -1,15 +1,80 @@ # Tutorial -The goal of this tutorial is to show an example of commands on how Binette can be used on real data. We will start ou journey from metagenomics reads that we gonna download, then we will assemble these reads in contigs that we will bin with different binning tool. I finally we will use Binette to refine those bins. 
+In this tutorial, we'll walk through a practical example of how to use Binette with real data. We'll start by downloading metagenomics reads and then assemble these reads into contigs. Next, we'll use different binning tools to group the contigs. Finally, we'll use Binette to refine these bins and improve our results. +```{mermaid} +--- +title: "Tutorial Overview:" +align: center +--- + +%%{init: {'theme':'default'}}%% + +graph LR + + A[Download Metagenomics Reads] --> B + B[Assemble Reads into Contigs] --> c + subgraph Pangenome creation + a:::workflow + c:::workflow + g:::workflow + p:::workflow + a("annotate") --> c + c(cluster) --> g(graph) + g(graph) --> p(partition) + end + + + C[Bin Contigs with Binning Tools] --> D[Refine Bins with Binette] + + + + classDef panrgp fill:#4066d4 + classDef panmodule fill:#d44066 + classDef workflow fill:#d4ae40 + + +``` + +```{mermaid} + +--- +title: "Tutorial Overview:" +align: center +--- + + +graph TD + + i[Get Metagenomics Reads] --> B[Assembly & Reads alignment] + + + B --> metabat2 --> r[Binette] + B --> maxbin2 --> r + B --> concoct --> r + B --> semibin2 --> r + + subgraph Binning + metabat2:::binning + maxbin2:::binning + concoct:::binning + semibin2:::binning + end + + + classDef binning fill:#d4ae40 + + +``` ```{toctree} -:caption: 'Tutorial' -:maxdepth: 2 +:caption: 'Tutorial steps' +:maxdepth: 1 -set_env_and_get_data +set_environment +get_dataset assembly binning binette From de93964300e81baa9878aa390e10c0038f981d69 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 2024 18:02:13 +0200 Subject: [PATCH 07/20] add mermaid deps --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6fad059..8f12e86 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,8 @@ doc = [ "docutils==0.18.1", #"myst-nb", "nbsphinx==0.9.5", "nbsphinx_link==1.3.0", - "sphinx-book-theme==1.0.1" + "sphinx-book-theme==1.0.1", + "sphinxcontrib.mermaid" ] dev = 
[ From 7f2c52f06a1c04b102dcffa0877802e0c6cdd222 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 12:04:41 +0200 Subject: [PATCH 08/20] improve tutorial --- docs/conf.py | 4 +- docs/tutorial/analyse_binette_result.ipynb | 1365 ++++------------- docs/tutorial/assembly.md | 21 +- .../final_bins_quality_reports.tsv | 16 + .../input_bins_1.concoct_bins.tsv | 64 + .../input_bins_2.maxbin2.tsv | 24 + .../input_bins_3.metabat2.tsv | 15 + .../input_bins_4.semibin2_output_bins.tsv | 26 + docs/tutorial/binning.md | 44 +- docs/tutorial/tutorial_main.md | 52 +- 10 files changed, 538 insertions(+), 1093 deletions(-) create mode 100644 docs/tutorial/binette_results/final_bins_quality_reports.tsv create mode 100644 docs/tutorial/binette_results/input_bins_quality_reports/input_bins_1.concoct_bins.tsv create mode 100644 docs/tutorial/binette_results/input_bins_quality_reports/input_bins_2.maxbin2.tsv create mode 100644 docs/tutorial/binette_results/input_bins_quality_reports/input_bins_3.metabat2.tsv create mode 100644 docs/tutorial/binette_results/input_bins_quality_reports/input_bins_4.semibin2_output_bins.tsv diff --git a/docs/conf.py b/docs/conf.py index 37654c6..245781e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -61,7 +61,7 @@ # `path/to/file:heading` instead of just `heading` autosectionlabel_prefix_document = True -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'build', "api"] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'build', "api", "jupyter_execute"] @@ -69,7 +69,7 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output # html_theme = 'sphinx_rtd_theme' #'alabaster' # -html_theme = 'sphinx_book_theme' +html_theme = 'sphinx_rtd_theme' #'sphinx_book_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, diff --git a/docs/tutorial/analyse_binette_result.ipynb b/docs/tutorial/analyse_binette_result.ipynb index 30247a8..94d1fc8 100644 --- a/docs/tutorial/analyse_binette_result.ipynb +++ b/docs/tutorial/analyse_binette_result.ipynb @@ -2,31 +2,57 @@ "cells": [ { "cell_type": "markdown", - "id": "52e7f39c", + "id": "edcb3b82", "metadata": {}, "source": [ - "## Analyse Binette results" + "## Analyse Binette results\n", + "\n", + "Let's visualize the results from Binette and compare them to the initial bin sets used as input. " + ] + }, + { + "cell_type": "markdown", + "id": "dbe1d73b", + "metadata": {}, + "source": [ + "### Import Necessary Libraries\n", + "\n", + "First, we'll need to import the necessary libraries for our analysis and plotting:" ] }, { "cell_type": "code", "execution_count": 1, - "id": "e6a1e1ee-681d-4823-b974-7027bafd2ba9", + "id": "9e9153ef", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pathlib import Path\n", "import plotly.express as px\n", + "\n", + "# This is needed to properly display Plotly graphs in the documentation\n", "import plotly.io as pio\n", "pio.renderers.default = \"sphinx_gallery\"" ] }, + { + "cell_type": "markdown", + "id": "b93e8a0e", + "metadata": {}, + "source": [ + "### Load Binette Results\n", + "\n", + "Now, let's load the final Binette quality report into a Pandas DataFrame:" + ] + }, { "cell_type": "code", "execution_count": 2, - "id": "34e80119-f59b-41b0-b0e5-de2d6ed0c6a3", - "metadata": {}, + "id": "d95ad45c", + "metadata": { + "lines_to_next_cell": 0 + }, "outputs": [ { "data": { @@ -321,15 +347,25 @@ "source": [ "binette_result_file = \"./binette_results/final_bins_quality_reports.tsv\"\n", "df_binette = pd.read_csv(binette_result_file, sep='\\t')\n", - "df_binette['tool'] = \"binette\"\n", - "df_binette['index'] = df_binette.index\n", + "df_binette['tool'] = \"binette\" # Add a column to label the tool\n", + "df_binette['index'] = 
df_binette.index # Add an index column\n", "df_binette" ] }, + { + "cell_type": "markdown", + "id": "c1372a73", + "metadata": {}, + "source": [ + "### Load and Combine Input Bin Quality Reports\n", + "\n", + "Next, we will load the quality reports of the input bin sets, computed by various tools and saved by Binette. We’ll combine these into a single DataFrame and add a column to indicate high-quality bins. We define a high-quality bin as one with contamination ≤ 5% and completeness ≥ 90%." + ] + }, { "cell_type": "code", "execution_count": 3, - "id": "189038d3-77a0-435a-9590-4d8b3038341e", + "id": "fcb016f2", "metadata": {}, "outputs": [ { @@ -353,98 +389,134 @@ " \n", " \n", " \n", + " tool\n", " completeness\n", " contamination\n", - " tool\n", + " size\n", + " N50\n", + " contig_count\n", " \n", " \n", " \n", " \n", " 0\n", + " binette\n", " 100.00\n", " 0.05\n", - " binette\n", + " 4672665\n", + " 82084\n", + " 93\n", " \n", " \n", " 1\n", + " binette\n", " 99.90\n", " 0.20\n", - " binette\n", + " 2796605\n", + " 41151\n", + " 98\n", " \n", " \n", " 2\n", + " binette\n", " 98.59\n", " 0.83\n", - " binette\n", + " 4601336\n", + " 41016\n", + " 165\n", " \n", " \n", " 3\n", + " binette\n", " 96.10\n", " 0.34\n", - " binette\n", + " 2598718\n", + " 11891\n", + " 312\n", " \n", " \n", " 4\n", + " binette\n", " 91.98\n", " 1.71\n", - " binette\n", + " 1768095\n", + " 9976\n", + " 250\n", " \n", " \n", " ...\n", " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", " 20\n", + " semibin2\n", " 8.28\n", " 0.01\n", - " semibin2\n", + " 358822\n", + " 3296\n", + " 106\n", " \n", " \n", " 21\n", + " semibin2\n", " 8.12\n", " 0.02\n", - " semibin2\n", + " 353499\n", + " 3949\n", + " 90\n", " \n", " \n", " 22\n", + " semibin2\n", " 7.74\n", " 0.01\n", - " semibin2\n", + " 351540\n", + " 4284\n", + " 85\n", " \n", " \n", " 23\n", + " semibin2\n", " 6.18\n", " 0.00\n", - " semibin2\n", + " 250833\n", + " 3607\n", + " 66\n", " \n", " \n", " 24\n", + 
" semibin2\n", " 4.41\n", " 0.13\n", - " semibin2\n", + " 217541\n", + " 3425\n", + " 64\n", " \n", " \n", "\n", - "

140 rows × 3 columns

\n", + "

140 rows × 6 columns

\n", "" ], "text/plain": [ - " completeness contamination tool\n", - "0 100.00 0.05 binette\n", - "1 99.90 0.20 binette\n", - "2 98.59 0.83 binette\n", - "3 96.10 0.34 binette\n", - "4 91.98 1.71 binette\n", - ".. ... ... ...\n", - "20 8.28 0.01 semibin2\n", - "21 8.12 0.02 semibin2\n", - "22 7.74 0.01 semibin2\n", - "23 6.18 0.00 semibin2\n", - "24 4.41 0.13 semibin2\n", + " tool completeness contamination size N50 contig_count\n", + "0 binette 100.00 0.05 4672665 82084 93\n", + "1 binette 99.90 0.20 2796605 41151 98\n", + "2 binette 98.59 0.83 4601336 41016 165\n", + "3 binette 96.10 0.34 2598718 11891 312\n", + "4 binette 91.98 1.71 1768095 9976 250\n", + ".. ... ... ... ... ... ...\n", + "20 semibin2 8.28 0.01 358822 3296 106\n", + "21 semibin2 8.12 0.02 353499 3949 90\n", + "22 semibin2 7.74 0.01 351540 4284 85\n", + "23 semibin2 6.18 0.00 250833 3607 66\n", + "24 semibin2 4.41 0.13 217541 3425 64\n", "\n", - "[140 rows x 3 columns]" + "[140 rows x 6 columns]" ] }, "execution_count": 3, @@ -453,36 +525,51 @@ } ], "source": [ + "from pathlib import Path\n", + "\n", "input_bins_quality_reports_dir = Path(\"binette_results/input_bins_quality_reports/\")\n", "\n", + "# Initialize the list with Binette results\n", "df_input_bin_list = [df_binette]\n", + "\n", + "# Load each input bin quality report\n", "for input_bin_metric_file in input_bins_quality_reports_dir.glob(\"*tsv\"):\n", - " tool = input_bin_metric_file.name.split('.')[1].split('_')[0]\n", + " tool = input_bin_metric_file.name.split('.')[1].split('_')[0] # Extract tool name from file name\n", " df_input = pd.read_csv(input_bin_metric_file, sep='\\t')\n", " df_input['index'] = df_input.index\n", " df_input['tool'] = tool\n", " df_input_bin_list.append(df_input)\n", "\n", - "df_bins = pd.concat(df_input_bin_list)\n", - " \n", - "set(df_bins['tool'])\n", - "df_bins[\"High quality bin\"] = (df_bins['completeness'] >= 90) & (df_bins['contamination'] <= 5)\n", - "#df_binette = pd.read_csv(binette_result_file, 
sep='\\t')\n", - "#df_binette\n", - "df_bins[[\"completeness\", \"contamination\", \"tool\"]]\n" + "# Combine all DataFrames into one\n", + "df_bins = pd.concat(df_input_bin_list)\n", + "\n", + "# Add a column to indicate high-quality bins\n", + "df_bins[\"High quality bin\"] = (df_bins['completeness'] >= 90) & (df_bins['contamination'] <= 5)\n", + "\n", + "# Display relevant columns\n", + "df_bins[[ \"tool\", \"completeness\", \"contamination\", \"size\", \"N50\", \"contig_count\"]]\n" + ] + }, + { + "cell_type": "markdown", + "id": "80ef2544", + "metadata": {}, + "source": [ + "### Plot bin completeness and contamination\n", + "With the DataFrame containing both Binette’s final bins and the input bins, we can now create a scatter plot to visualize the results:" ] }, { "cell_type": "code", "execution_count": 4, - "id": "911d598f-a6c7-4178-aff2-6059235e7fc4", + "id": "277cb781", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", - "
" + "
" ] }, "metadata": {}, @@ -490,25 +577,63 @@ } ], "source": [ - "fig = px.scatter(df_bins, x=\"completeness\",y=\"contamination\", color=\"High quality bin\", size=\"size\", facet_row=\"tool\")\n", + "import plotly.express as px\n", + "\n", + "# Create a scatter plot to visualize completeness and contamination\n", + "fig = px.scatter(df_bins, \n", + " x=\"completeness\", \n", + " y=\"contamination\", \n", + " color=\"High quality bin\", \n", + " size=\"size\", \n", + " facet_row=\"tool\",\n", + " title=\"Bin Quality Comparison\",\n", + " )\n", + "\n", + "# Update layout for better visibility\n", "fig.update_layout(\n", - " width=800,\n", - " height=800)\n", - " \n", + " width=600,\n", + " height=800,\n", + " legend_title=\"High Quality Bin\",\n", + " title=\"Comparison of Bin Quality Metrics\"\n", + ")\n", + "\n", + "# Show the plot\n", "fig.show()" ] }, + { + "cell_type": "markdown", + "id": "06a14412", + "metadata": {}, + "source": [ + "We can see that binette bins are the one displaying the most high quality bins (completeness ≥ 90% and contamination ≤ 5%).\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "323f5637", + "metadata": {}, + "source": [ + "### Comparing Binning Tools Using Bin Score Curves\n", + "\n", + "A common way to compare bin sets is by sorting the bins based on their scores and plotting them against their index.\n", + "\n", + "Here’s how we can create such a plot:" + ] + }, { "cell_type": "code", "execution_count": 5, - "id": "35c46beb-1ac9-4014-9672-91edcc1bf439", + "id": "79faaa3a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", - "
" + "
" ] }, "metadata": {}, @@ -516,18 +641,79 @@ } ], "source": [ - "df_bins['completeness - 2*contamination'] = df_bins['completeness'] - 2*df_bins['contamination']\n", - "fig = px.line(df_bins, x=\"index\",y='completeness - 2*contamination', color=\"tool\",markers=True)\n", - "fig.update_layout(\n", - " width=800,\n", - " height=500)\n", + "# Calculate the score for each bin\n", + "df_bins['completeness - 2*contamination'] = df_bins['completeness'] - 2 * df_bins['contamination']\n", + "\n", + "# Plot the score against the bin index\n", + "fig = px.line(df_bins, x=\"index\", y='completeness - 2*contamination', color=\"tool\", markers=True)\n", + "fig.update_layout(width=600, height=500)\n", "fig.show()" ] }, + { + "cell_type": "markdown", + "id": "97aee4d0", + "metadata": {}, + "source": [ + "From the plot, you might notice that Concoct has a lot of bins with lower quality scores. Let’s zoom in to get a better look:" + ] + }, { "cell_type": "code", "execution_count": 6, - "id": "af74bfb2-457c-4cf4-9c13-3ee9642be7ce", + "id": "063974f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Adjust the plot view to zoom in\n", + "fig.update_layout(\n", + " xaxis_range=[-1, 20], # Zoom on x-axis\n", + " yaxis_range=[0, 100], # Zoom on y-axis\n", + " width=600,\n", + " height=500\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "136b17e4", + "metadata": {}, + "source": [ + "Binette line consistently appears above the other binning tools. This indicates that Binette produce higher-quality bins compared to the initial bin sets." + ] + }, + { + "cell_type": "markdown", + "id": "46f1b3d0", + "metadata": {}, + "source": [ + "### Plot Number of High-Quality Bins per Bin Set\n", + "\n", + "Let's plot the number of bins falling into different quality categories. We’ll focus on bins with a maximum of 10% contamination and classify them into three completeness categories:\n", + "\n", + "- **`> 50% and ≤ 70%`**\n", + "- **`> 70% and ≤ 90%`**\n", + "- **`> 90%`**\n", + "\n", + "First, let’s group and count the bins in each category:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "943f88b4", "metadata": {}, "outputs": [ { @@ -551,1006 +737,47 @@ " \n", " \n", " \n", - " bin_id\n", - " origin\n", - " name\n", - " completeness\n", - " contamination\n", - " score\n", - " size\n", - " N50\n", - " contig_count\n", - " tool\n", - " index\n", - " High quality bin\n", - " completeness - 2*contamination\n", " Contamination ≤ 10 and<br>Completeness\n", + " tool\n", + " bin_count\n", " \n", " \n", " \n", " \n", " 0\n", - " 17075\n", - " diff\n", - " 44 - 10\n", - " 100.00\n", - " 0.05\n", - " 99.90\n", - " 4672665\n", - " 82084\n", - " 93\n", + " > 50% and ≤ 70%\n", " binette\n", - " 0\n", - " True\n", - " 99.90\n", - " > 90%\n", + " 5\n", " \n", " \n", " 1\n", - " 39427\n", - " diff\n", - " 36 - 6\n", - " 99.90\n", - " 0.20\n", - " 99.50\n", - " 2796605\n", - " 41151\n", - " 98\n", - " binette\n", + " > 50% and ≤ 70%\n", + " maxbin2\n", " 
1\n", - " True\n", - " 99.50\n", - " > 90%\n", " \n", " \n", " 2\n", - " 47060\n", - " union\n", - " 58 | 33\n", - " 98.59\n", - " 0.83\n", - " 96.93\n", - " 4601336\n", - " 41016\n", - " 165\n", - " binette\n", - " 2\n", - " True\n", - " 96.93\n", - " > 90%\n", + " > 50% and ≤ 70%\n", + " metabat2\n", + " 1\n", " \n", " \n", " 3\n", - " 47177\n", - " union\n", - " 91 | 25 | 55\n", - " 96.10\n", - " 0.34\n", - " 95.42\n", - " 2598718\n", - " 11891\n", - " 312\n", - " binette\n", - " 3\n", - " True\n", - " 95.42\n", - " > 90%\n", + " > 50% and ≤ 70%\n", + " semibin2\n", + " 2\n", " \n", " \n", " 4\n", - " 21248\n", - " diff\n", - " 65 - 8 - 28\n", - " 91.98\n", - " 1.71\n", - " 88.56\n", - " 1768095\n", - " 9976\n", - " 250\n", + " > 70% and ≤ 90%\n", " binette\n", - " 4\n", - " True\n", - " 88.56\n", - " > 90%\n", + " 3\n", " \n", " \n", " 5\n", - " 44137\n", - " diff\n", - " 76 - 13 - 28\n", - " 92.63\n", - " 2.41\n", - " 87.81\n", - " 3726254\n", - " 5669\n", - " 850\n", - " binette\n", - " 5\n", - " True\n", - " 87.81\n", - " > 90%\n", - " \n", - " \n", - " 6\n", - " 31703\n", - " diff\n", - " 31 - 7 - 61\n", - " 81.73\n", - " 0.84\n", - " 80.05\n", - " 1665233\n", - " 8518\n", - " 248\n", - " binette\n", - " 6\n", - " False\n", - " 80.05\n", - " > 70% and ≤ 90%\n", - " \n", - " \n", - " 7\n", - " 13475\n", - " diff\n", - " 47 - 37\n", - " 72.89\n", - " 2.39\n", - " 68.11\n", - " 1241829\n", - " 5061\n", - " 252\n", - " binette\n", - " 7\n", - " False\n", - " 68.11\n", - " > 70% and ≤ 90%\n", - " \n", - " \n", - " 8\n", - " 47926\n", - " union\n", - " 75 | 30\n", - " 74.31\n", - " 4.26\n", - " 65.79\n", - " 3293949\n", - " 2954\n", - " 1262\n", - " binette\n", - " 8\n", - " False\n", - " 65.79\n", - " > 70% and ≤ 90%\n", - " \n", - " \n", - " 9\n", - " 46775\n", - " union\n", - " 42 | 102\n", - " 62.94\n", - " 2.75\n", - " 57.44\n", - " 1293571\n", - " 3783\n", - " 419\n", - " binette\n", - " 9\n", - " False\n", - " 57.44\n", - " > 50% and ≤ 70%\n", - " \n", - " 
\n", - " 10\n", - " 33569\n", - " diff\n", - " 83 - 7 - 38 - 31\n", - " 59.18\n", - " 2.24\n", - " 54.70\n", - " 2042527\n", - " 4437\n", - " 514\n", - " binette\n", - " 10\n", - " False\n", - " 54.70\n", - " > 50% and ≤ 70%\n", - " \n", - " \n", - " 11\n", - " 39350\n", - " diff\n", - " 57 - 16 - 75\n", - " 52.16\n", - " 1.31\n", - " 49.54\n", - " 2601282\n", - " 5332\n", - " 509\n", - " binette\n", - " 11\n", - " False\n", - " 49.54\n", - " > 50% and ≤ 70%\n", - " \n", - " \n", - " 12\n", - " 39558\n", - " diff\n", - " 78 - 6 - 43\n", - " 64.63\n", - " 8.03\n", - " 48.57\n", - " 1858210\n", - " 1430\n", - " 1293\n", - " binette\n", - " 12\n", - " False\n", - " 48.57\n", - " > 50% and ≤ 70%\n", - " \n", - " \n", - " 13\n", - " 51082\n", - " union\n", - " 120 | 1\n", - " 52.33\n", - " 5.06\n", - " 42.21\n", - " 688879\n", - " 1446\n", - " 472\n", - " binette\n", - " 13\n", - " False\n", - " 42.21\n", - " > 50% and ≤ 70%\n", - " \n", - " \n", - " 0\n", - " 125\n", - " concoct/bins\n", - " 9.fa\n", - " 100.00\n", - " 0.38\n", - " 99.24\n", - " 3033586\n", - " 37523\n", - " 131\n", - " concoct\n", - " 0\n", - " True\n", - " 99.24\n", - " > 90%\n", - " \n", - " \n", - " 1\n", - " 67\n", - " concoct/bins\n", - " 41.fa\n", - " 100.00\n", - " 0.46\n", - " 99.08\n", - " 4765466\n", - " 82084\n", - " 101\n", - " concoct\n", - " 1\n", - " True\n", - " 99.08\n", - " > 90%\n", - " \n", - " \n", - " 2\n", - " 91\n", - " concoct/bins\n", - " 7.fa\n", - " 92.76\n", - " 0.34\n", - " 92.08\n", - " 2274951\n", - " 12187\n", - " 265\n", - " concoct\n", - " 2\n", - " True\n", - " 92.08\n", - " > 90%\n", - " \n", - " \n", - " 3\n", - " 76\n", - " concoct/bins\n", - " 6.fa\n", - " 92.63\n", - " 3.42\n", - " 85.79\n", - " 3751950\n", - " 5674\n", - " 855\n", - " concoct\n", - " 3\n", - " True\n", - " 85.79\n", - " > 90%\n", - " \n", - " \n", - " 4\n", - " 65\n", - " concoct/bins\n", - " 62.fa\n", - " 87.35\n", - " 1.80\n", - " 83.75\n", - " 1917859\n", - " 10911\n", - " 259\n", - " 
concoct\n", - " 4\n", - " False\n", - " 83.75\n", - " > 70% and ≤ 90%\n", - " \n", - " \n", - " 6\n", - " 75\n", - " concoct/bins\n", - " 48.fa\n", - " 73.35\n", - " 4.26\n", - " 64.83\n", - " 3285374\n", - " 2950\n", - " 1261\n", - " concoct\n", - " 6\n", - " False\n", - " 64.83\n", - " > 70% and ≤ 90%\n", - " \n", - " \n", - " 0\n", - " 22\n", - " maxbin2\n", - " maxbin2.001.fasta\n", - " 99.81\n", - " 4.81\n", - " 90.19\n", - " 4616818\n", - " 89436\n", - " 133\n", - " maxbin2\n", - " 0\n", - " True\n", - " 90.19\n", - " > 90%\n", - " \n", - " \n", - " 1\n", - " 14\n", - " maxbin2\n", - " maxbin2.002.fasta\n", - " 93.92\n", - " 3.53\n", - " 86.86\n", - " 2874373\n", - " 37523\n", - " 195\n", - " maxbin2\n", - " 1\n", - " True\n", - " 86.86\n", - " > 90%\n", - " \n", - " \n", - " 3\n", - " 5\n", - " maxbin2\n", - " maxbin2.009.fasta\n", - " 62.69\n", - " 8.14\n", - " 46.41\n", - " 2438492\n", - " 6141\n", - " 604\n", - " maxbin2\n", - " 3\n", - " False\n", - " 46.41\n", - " > 50% and ≤ 70%\n", - " \n", - " \n", - " 0\n", - " 36\n", - " metabat2\n", - " metabat2.14.fa\n", - " 99.90\n", - " 0.24\n", - " 99.42\n", - " 2799572\n", - " 41151\n", - " 99\n", - " metabat2\n", - " 0\n", - " True\n", - " 99.42\n", - " > 90%\n", - " \n", - " \n", - " 1\n", - " 25\n", - " metabat2\n", - " metabat2.8.fa\n", - " 93.17\n", - " 0.22\n", - " 92.73\n", - " 2148097\n", - " 12225\n", - " 226\n", - " metabat2\n", - " 1\n", - " True\n", - " 92.73\n", - " > 90%\n", - " \n", - " \n", - " 2\n", - " 33\n", - " metabat2\n", - " metabat2.12.fa\n", - " 93.52\n", - " 0.92\n", - " 91.68\n", - " 4266134\n", - " 39217\n", - " 157\n", - " metabat2\n", - " 2\n", - " True\n", - " 91.68\n", - " > 90%\n", - " \n", - " \n", - " 3\n", - " 27\n", - " metabat2\n", - " metabat2.11.fa\n", - " 84.40\n", - " 1.53\n", - " 81.34\n", - " 1902761\n", - " 11352\n", - " 218\n", - " metabat2\n", - " 3\n", - " False\n", - " 81.34\n", - " > 70% and ≤ 90%\n", - " \n", - " \n", - " 4\n", - " 37\n", - " metabat2\n", - " 
metabat2.1.fa\n", - " 84.99\n", - " 2.73\n", - " 79.53\n", - " 2980526\n", - " 6876\n", - " 502\n", - " metabat2\n", - " 4\n", - " False\n", - " 79.53\n", - " > 70% and ≤ 90%\n", - " \n", - " \n", - " 5\n", - " 31\n", - " metabat2\n", - " metabat2.2.fa\n", - " 83.21\n", - " 3.16\n", - " 76.89\n", - " 1807028\n", - " 7852\n", - " 274\n", - " metabat2\n", - " 5\n", - " False\n", - " 76.89\n", - " > 70% and ≤ 90%\n", - " \n", - " \n", - " 6\n", - " 35\n", - " metabat2\n", - " metabat2.4.fa\n", - " 76.53\n", - " 0.11\n", - " 76.31\n", - " 3477636\n", - " 82084\n", - " 71\n", - " metabat2\n", - " 6\n", - " False\n", - " 76.31\n", - " > 70% and ≤ 90%\n", - " \n", - " \n", - " 7\n", - " 29\n", - " metabat2\n", - " metabat2.7.fa\n", - " 71.78\n", - " 5.77\n", - " 60.24\n", - " 1384653\n", - " 4937\n", - " 292\n", - " metabat2\n", - " 7\n", - " False\n", - " 60.24\n", - " > 70% and ≤ 90%\n", - " \n", - " \n", - " 8\n", - " 24\n", - " metabat2\n", - " metabat2.3.fa\n", - " 51.75\n", - " 2.99\n", - " 45.77\n", - " 1707078\n", - " 4929\n", - " 362\n", - " metabat2\n", - " 8\n", - " False\n", - " 45.77\n", - " > 50% and ≤ 70%\n", - " \n", - " \n", - " 0\n", - " 44\n", - " semibin2/output_bins\n", - " SemiBin_27.fa.gz\n", - " 100.00\n", - " 0.09\n", - " 99.82\n", - " 4681369\n", - " 82084\n", - " 94\n", - " semibin2\n", - " 0\n", - " True\n", - " 99.82\n", - " > 90%\n", - " \n", - " \n", - " 1\n", - " 53\n", - " semibin2/output_bins\n", - " SemiBin_33.fa.gz\n", - " 99.92\n", - " 0.28\n", - " 99.36\n", - " 2937678\n", - " 37523\n", - " 113\n", - " semibin2\n", - " 1\n", - " True\n", - " 99.36\n", - " > 90%\n", - " \n", - " \n", - " 2\n", - " 50\n", - " semibin2/output_bins\n", - " SemiBin_10.fa.gz\n", - " 93.43\n", - " 0.14\n", - " 93.15\n", - " 2129295\n", - " 12519\n", - " 216\n", - " semibin2\n", - " 2\n", - " True\n", - " 93.15\n", - " > 90%\n", - " \n", - " \n", - " 3\n", - " 62\n", - " semibin2/output_bins\n", - " SemiBin_24.fa.gz\n", - " 92.13\n", - " 0.03\n", - " 
92.07\n", - " 4162911\n", - " 40395\n", - " 139\n", - " semibin2\n", - " 3\n", - " True\n", - " 92.07\n", - " > 90%\n", - " \n", - " \n", - " 4\n", - " 38\n", - " semibin2/output_bins\n", - " SemiBin_26.fa.gz\n", - " 83.09\n", - " 2.25\n", - " 78.59\n", - " 1674156\n", - " 8389\n", - " 245\n", - " semibin2\n", - " 4\n", - " False\n", - " 78.59\n", - " > 70% and ≤ 90%\n", - " \n", - " \n", - " 5\n", - " 49\n", - " semibin2/output_bins\n", - " SemiBin_32.fa.gz\n", - " 81.87\n", - " 1.66\n", - " 78.55\n", - " 1820073\n", - " 11737\n", - " 205\n", - " semibin2\n", - " 5\n", - " False\n", - " 78.55\n", - " > 70% and ≤ 90%\n", - " \n", - " \n", - " 6\n", - " 60\n", - " semibin2/output_bins\n", - " SemiBin_22.fa.gz\n", - " 80.25\n", - " 1.63\n", - " 76.99\n", - " 2790948\n", - " 7117\n", - " 450\n", - " semibin2\n", - " 6\n", - " False\n", - " 76.99\n", - " > 70% and ≤ 90%\n", - " \n", - " \n", - " 7\n", - " 47\n", - " semibin2/output_bins\n", - " SemiBin_11.fa.gz\n", - " 72.57\n", - " 2.45\n", - " 67.67\n", - " 1245031\n", - " 5061\n", - " 253\n", - " semibin2\n", - " 7\n", - " False\n", - " 67.67\n", - " > 70% and ≤ 90%\n", - " \n", - " \n", - " 8\n", - " 61\n", - " semibin2/output_bins\n", - " SemiBin_3.fa.gz\n", - " 53.34\n", - " 1.33\n", - " 50.68\n", - " 1728690\n", - " 4913\n", - " 367\n", - " semibin2\n", - " 8\n", - " False\n", - " 50.68\n", - " > 50% and ≤ 70%\n", - " \n", - " \n", - " 9\n", - " 57\n", - " semibin2/output_bins\n", - " SemiBin_12.fa.gz\n", - " 51.92\n", - " 1.31\n", - " 49.30\n", - " 2609451\n", - " 5292\n", - " 511\n", - " semibin2\n", - " 9\n", - " False\n", - " 49.30\n", - " > 50% and ≤ 70%\n", - " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " bin_id origin name completeness \\\n", - "0 17075 diff 44 - 10 100.00 \n", - "1 39427 diff 36 - 6 99.90 \n", - "2 47060 union 58 | 33 98.59 \n", - "3 47177 union 91 | 25 | 55 96.10 \n", - "4 21248 diff 65 - 8 - 28 91.98 \n", - "5 44137 diff 76 - 13 - 28 92.63 \n", - "6 31703 diff 31 - 7 - 61 
81.73 \n", - "7 13475 diff 47 - 37 72.89 \n", - "8 47926 union 75 | 30 74.31 \n", - "9 46775 union 42 | 102 62.94 \n", - "10 33569 diff 83 - 7 - 38 - 31 59.18 \n", - "11 39350 diff 57 - 16 - 75 52.16 \n", - "12 39558 diff 78 - 6 - 43 64.63 \n", - "13 51082 union 120 | 1 52.33 \n", - "0 125 concoct/bins 9.fa 100.00 \n", - "1 67 concoct/bins 41.fa 100.00 \n", - "2 91 concoct/bins 7.fa 92.76 \n", - "3 76 concoct/bins 6.fa 92.63 \n", - "4 65 concoct/bins 62.fa 87.35 \n", - "6 75 concoct/bins 48.fa 73.35 \n", - "0 22 maxbin2 maxbin2.001.fasta 99.81 \n", - "1 14 maxbin2 maxbin2.002.fasta 93.92 \n", - "3 5 maxbin2 maxbin2.009.fasta 62.69 \n", - "0 36 metabat2 metabat2.14.fa 99.90 \n", - "1 25 metabat2 metabat2.8.fa 93.17 \n", - "2 33 metabat2 metabat2.12.fa 93.52 \n", - "3 27 metabat2 metabat2.11.fa 84.40 \n", - "4 37 metabat2 metabat2.1.fa 84.99 \n", - "5 31 metabat2 metabat2.2.fa 83.21 \n", - "6 35 metabat2 metabat2.4.fa 76.53 \n", - "7 29 metabat2 metabat2.7.fa 71.78 \n", - "8 24 metabat2 metabat2.3.fa 51.75 \n", - "0 44 semibin2/output_bins SemiBin_27.fa.gz 100.00 \n", - "1 53 semibin2/output_bins SemiBin_33.fa.gz 99.92 \n", - "2 50 semibin2/output_bins SemiBin_10.fa.gz 93.43 \n", - "3 62 semibin2/output_bins SemiBin_24.fa.gz 92.13 \n", - "4 38 semibin2/output_bins SemiBin_26.fa.gz 83.09 \n", - "5 49 semibin2/output_bins SemiBin_32.fa.gz 81.87 \n", - "6 60 semibin2/output_bins SemiBin_22.fa.gz 80.25 \n", - "7 47 semibin2/output_bins SemiBin_11.fa.gz 72.57 \n", - "8 61 semibin2/output_bins SemiBin_3.fa.gz 53.34 \n", - "9 57 semibin2/output_bins SemiBin_12.fa.gz 51.92 \n", - "\n", - " contamination score size N50 contig_count tool index \\\n", - "0 0.05 99.90 4672665 82084 93 binette 0 \n", - "1 0.20 99.50 2796605 41151 98 binette 1 \n", - "2 0.83 96.93 4601336 41016 165 binette 2 \n", - "3 0.34 95.42 2598718 11891 312 binette 3 \n", - "4 1.71 88.56 1768095 9976 250 binette 4 \n", - "5 2.41 87.81 3726254 5669 850 binette 5 \n", - "6 0.84 80.05 1665233 8518 248 binette 6 
\n", - "7 2.39 68.11 1241829 5061 252 binette 7 \n", - "8 4.26 65.79 3293949 2954 1262 binette 8 \n", - "9 2.75 57.44 1293571 3783 419 binette 9 \n", - "10 2.24 54.70 2042527 4437 514 binette 10 \n", - "11 1.31 49.54 2601282 5332 509 binette 11 \n", - "12 8.03 48.57 1858210 1430 1293 binette 12 \n", - "13 5.06 42.21 688879 1446 472 binette 13 \n", - "0 0.38 99.24 3033586 37523 131 concoct 0 \n", - "1 0.46 99.08 4765466 82084 101 concoct 1 \n", - "2 0.34 92.08 2274951 12187 265 concoct 2 \n", - "3 3.42 85.79 3751950 5674 855 concoct 3 \n", - "4 1.80 83.75 1917859 10911 259 concoct 4 \n", - "6 4.26 64.83 3285374 2950 1261 concoct 6 \n", - "0 4.81 90.19 4616818 89436 133 maxbin2 0 \n", - "1 3.53 86.86 2874373 37523 195 maxbin2 1 \n", - "3 8.14 46.41 2438492 6141 604 maxbin2 3 \n", - "0 0.24 99.42 2799572 41151 99 metabat2 0 \n", - "1 0.22 92.73 2148097 12225 226 metabat2 1 \n", - "2 0.92 91.68 4266134 39217 157 metabat2 2 \n", - "3 1.53 81.34 1902761 11352 218 metabat2 3 \n", - "4 2.73 79.53 2980526 6876 502 metabat2 4 \n", - "5 3.16 76.89 1807028 7852 274 metabat2 5 \n", - "6 0.11 76.31 3477636 82084 71 metabat2 6 \n", - "7 5.77 60.24 1384653 4937 292 metabat2 7 \n", - "8 2.99 45.77 1707078 4929 362 metabat2 8 \n", - "0 0.09 99.82 4681369 82084 94 semibin2 0 \n", - "1 0.28 99.36 2937678 37523 113 semibin2 1 \n", - "2 0.14 93.15 2129295 12519 216 semibin2 2 \n", - "3 0.03 92.07 4162911 40395 139 semibin2 3 \n", - "4 2.25 78.59 1674156 8389 245 semibin2 4 \n", - "5 1.66 78.55 1820073 11737 205 semibin2 5 \n", - "6 1.63 76.99 2790948 7117 450 semibin2 6 \n", - "7 2.45 67.67 1245031 5061 253 semibin2 7 \n", - "8 1.33 50.68 1728690 4913 367 semibin2 8 \n", - "9 1.31 49.30 2609451 5292 511 semibin2 9 \n", - "\n", - " High quality bin completeness - 2*contamination \\\n", - "0 True 99.90 \n", - "1 True 99.50 \n", - "2 True 96.93 \n", - "3 True 95.42 \n", - "4 True 88.56 \n", - "5 True 87.81 \n", - "6 False 80.05 \n", - "7 False 68.11 \n", - "8 False 65.79 \n", - "9 False 
57.44 \n", - "10 False 54.70 \n", - "11 False 49.54 \n", - "12 False 48.57 \n", - "13 False 42.21 \n", - "0 True 99.24 \n", - "1 True 99.08 \n", - "2 True 92.08 \n", - "3 True 85.79 \n", - "4 False 83.75 \n", - "6 False 64.83 \n", - "0 True 90.19 \n", - "1 True 86.86 \n", - "3 False 46.41 \n", - "0 True 99.42 \n", - "1 True 92.73 \n", - "2 True 91.68 \n", - "3 False 81.34 \n", - "4 False 79.53 \n", - "5 False 76.89 \n", - "6 False 76.31 \n", - "7 False 60.24 \n", - "8 False 45.77 \n", - "0 True 99.82 \n", - "1 True 99.36 \n", - "2 True 93.15 \n", - "3 True 92.07 \n", - "4 False 78.59 \n", - "5 False 78.55 \n", - "6 False 76.99 \n", - "7 False 67.67 \n", - "8 False 50.68 \n", - "9 False 49.30 \n", - "\n", - " Contamination ≤ 10 and
Completeness \n", - "0 > 90% \n", - "1 > 90% \n", - "2 > 90% \n", - "3 > 90% \n", - "4 > 90% \n", - "5 > 90% \n", - "6 > 70% and ≤ 90% \n", - "7 > 70% and ≤ 90% \n", - "8 > 70% and ≤ 90% \n", - "9 > 50% and ≤ 70% \n", - "10 > 50% and ≤ 70% \n", - "11 > 50% and ≤ 70% \n", - "12 > 50% and ≤ 70% \n", - "13 > 50% and ≤ 70% \n", - "0 > 90% \n", - "1 > 90% \n", - "2 > 90% \n", - "3 > 90% \n", - "4 > 70% and ≤ 90% \n", - "6 > 70% and ≤ 90% \n", - "0 > 90% \n", - "1 > 90% \n", - "3 > 50% and ≤ 70% \n", - "0 > 90% \n", - "1 > 90% \n", - "2 > 90% \n", - "3 > 70% and ≤ 90% \n", - "4 > 70% and ≤ 90% \n", - "5 > 70% and ≤ 90% \n", - "6 > 70% and ≤ 90% \n", - "7 > 70% and ≤ 90% \n", - "8 > 50% and ≤ 70% \n", - "0 > 90% \n", - "1 > 90% \n", - "2 > 90% \n", - "3 > 90% \n", - "4 > 70% and ≤ 90% \n", - "5 > 70% and ≤ 90% \n", - "6 > 70% and ≤ 90% \n", - "7 > 70% and ≤ 90% \n", - "8 > 50% and ≤ 70% \n", - "9 > 50% and ≤ 70% " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "contamination_cutoff = 10\n", - "low_contamination_filt = df_bins['contamination'] <= contamination_cutoff\n", - "high_completeness_filt = df_bins['completeness'] > 90\n", - "medium_completeness_filt = df_bins['completeness'] > 70\n", - "low_completeness_filt = df_bins['completeness'] > 50\n", - "\n", - "quality = f'Contamination ≤ {contamination_cutoff} and
Completeness'\n", - "df_bins.loc[low_contamination_filt & low_completeness_filt, quality] = '> 50% and ≤ 70%'\n", - "df_bins.loc[low_contamination_filt & medium_completeness_filt, quality] = '> 70% and ≤ 90%'\n", - "df_bins.loc[low_contamination_filt & high_completeness_filt, quality] = '> 90%'\n", - "df_bins.loc[~df_bins[quality].isna()]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "fa71ff37-9846-4826-a4bb-6c4b0069cea0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1621,23 +848,45 @@ } ], "source": [ - "df_bins.groupby([quality, 'tool']).value_counts(ascending=True).reset_index()\n", + "# Define the contamination cutoff\n", + "contamination_cutoff = 10\n", + "\n", + "# Create filters for completeness categories\n", + "low_contamination_filt = df_bins['contamination'] <= contamination_cutoff\n", + "high_completeness_filt = df_bins['completeness'] > 90\n", + "medium_completeness_filt = df_bins['completeness'] > 70\n", + "low_completeness_filt = df_bins['completeness'] > 50\n", "\n", + "# Define quality categories\n", + "quality = f'Contamination ≤ {contamination_cutoff} and
Completeness'\n", + "df_bins.loc[low_contamination_filt & low_completeness_filt, quality] = '> 50% and ≤ 70%'\n", + "df_bins.loc[low_contamination_filt & medium_completeness_filt, quality] = '> 70% and ≤ 90%'\n", + "df_bins.loc[low_contamination_filt & high_completeness_filt, quality] = '> 90%'\n", + "\n", + "# Group and count bins by quality category and tool\n", "df_bins_quality_grouped = df_bins.groupby([quality, 'tool']).agg(bin_count=('bin_id', 'count')).reset_index()\n", "df_bins_quality_grouped" ] }, + { + "cell_type": "markdown", + "id": "6eec391a", + "metadata": {}, + "source": [ + "Now, let’s create a bar plot to visualize the number of bins in each quality category for each bin sets:" + ] + }, { "cell_type": "code", "execution_count": 8, - "id": "250def29-167e-4a3b-8194-282f602945c8", + "id": "36ce51ac", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", - "
" + "
" ] }, "metadata": {}, @@ -1645,22 +894,42 @@ } ], "source": [ - "color_discrete_map={\"> 90%\": px.colors.qualitative.Prism[4],\n", - " \"> 70% and ≤ 90%\": px.colors.qualitative.Prism[2],\n", - " \"> 50% and ≤ 70%\": px.colors.qualitative.Prism[6]}\n", + "# Define colors for each completeness category\n", + "color_discrete_map = {\n", + " \"> 90%\": px.colors.qualitative.Prism[4],\n", + " \"> 70% and ≤ 90%\": px.colors.qualitative.Prism[2],\n", + " \"> 50% and ≤ 70%\": px.colors.qualitative.Prism[6]\n", + "}\n", "\n", - "fig = px.bar(df_bins_quality_grouped, x='tool', y=\"bin_count\", color=quality,\n", - " barmode='stack', color_discrete_map=color_discrete_map, text=\"bin_count\",\n", - " category_orders={\"tool\":[\"binette\", \"semibin2\", \"concoct\", \"metabat2\", \"maxbin2\"]},\n", - " opacity = 0.9)#[ \"#008c8a\", px.colors.qualitative.Safe[4], '#2596be'])\n", + "# Create the bar plot\n", + "fig = px.bar(\n", + " df_bins_quality_grouped, \n", + " x='tool', \n", + " y=\"bin_count\", \n", + " color=quality,\n", + " barmode='stack', \n", + " color_discrete_map=color_discrete_map, \n", + " text=\"bin_count\",\n", + " category_orders={\"tool\": [\"binette\", \"semibin2\", \"concoct\", \"metabat2\", \"maxbin2\"]},\n", + " opacity=0.9\n", + ")\n", "\n", + "# Update layout for better appearance\n", "fig.update_layout(\n", - " width=800,\n", - " height=500,\n", - " legend=dict(\n", - " traceorder=\"reversed\",\n", - " ))\n", - "fig" + " width=600,\n", + " height=500,\n", + " legend=dict(traceorder=\"reversed\")\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "f78d0f29", + "metadata": {}, + "source": [ + "From the plot, you can see that Binette produces more high-quality bins compared to the initial bin sets! 
🎉" ] } ], @@ -1680,7 +949,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.8.19" } }, "nbformat": 4, diff --git a/docs/tutorial/assembly.md b/docs/tutorial/assembly.md index c7104ed..d2084bb 100644 --- a/docs/tutorial/assembly.md +++ b/docs/tutorial/assembly.md @@ -18,9 +18,28 @@ This process takes approximately 28 minutes to complete. ```{admonition} Note :class: note -You can also use **SPAdes** for assembly. It generally performs better than MEGAHIT but takes longer and requires more memory. Refer to the CAMI benchmark for a detailed comparison. +You can also use **SPAdes** for assembly. It generally performs better than MEGAHIT but takes longer and requires more memory. ``` + +```{admonition} Best Practices +:class: tip + +Here are some general tips that might help improve your assembly results, depending on your data: + +- **Read Cleaning:** If your reads have low-quality bases or adapters, consider cleaning them with a tool like `sickle`. It can boost the overall quality of your assembly. + +- **Quality Check:** Tools like `metaQUAST` are handy for checking your assembly’s quality. It’s a good way to ensure your results are solid before moving on. + +- **Assembly Filtering:** After assembling, it’s often a good idea to filter out small or low-coverage contigs. + + +These steps aren’t mandatory, and since this tutorial focuses on binning and using Binette, we’ll skip them for now. + +``` + + + ## Align the Reads Over the Assembly To get coverage information, we first need to map the reads back to the assembly. 
diff --git a/docs/tutorial/binette_results/final_bins_quality_reports.tsv b/docs/tutorial/binette_results/final_bins_quality_reports.tsv new file mode 100644 index 0000000..f575212 --- /dev/null +++ b/docs/tutorial/binette_results/final_bins_quality_reports.tsv @@ -0,0 +1,16 @@ +bin_id origin name completeness contamination score size N50 contig_count +17075 diff 44 - 10 100.0 0.05 99.9 4672665 82084 93 +39427 diff 36 - 6 99.9 0.2 99.5 2796605 41151 98 +47060 union 58 | 33 98.59 0.83 96.93 4601336 41016 165 +47177 union 91 | 25 | 55 96.1 0.34 95.41999999999999 2598718 11891 312 +21248 diff 65 - 8 - 28 91.98 1.71 88.56 1768095 9976 250 +44137 diff 76 - 13 - 28 92.63 2.41 87.81 3726254 5669 850 +31703 diff 31 - 7 - 61 81.73 0.84 80.05 1665233 8518 248 +13475 diff 47 - 37 72.89 2.39 68.11 1241829 5061 252 +47926 union 75 | 30 74.31 4.26 65.79 3293949 2954 1262 +46775 union 42 | 102 62.94 2.75 57.44 1293571 3783 419 +33569 diff 83 - 7 - 38 - 31 59.18 2.24 54.7 2042527 4437 514 +39350 diff 57 - 16 - 75 52.16 1.31 49.54 2601282 5332 509 +39558 diff 78 - 6 - 43 64.63 8.03 48.56999999999999 1858210 1430 1293 +51082 union 120 | 1 52.33 5.06 42.21 688879 1446 472 +19689 diff 118 - 18 - 61 - 31 48.22 8.23 31.759999999999998 1782676 1402 1265 diff --git a/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_1.concoct_bins.tsv b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_1.concoct_bins.tsv new file mode 100644 index 0000000..f4a995a --- /dev/null +++ b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_1.concoct_bins.tsv @@ -0,0 +1,64 @@ +bin_id origin name completeness contamination score size N50 contig_count +125 concoct/bins 9.fa 100.0 0.38 99.24 3033586 37523 131 +67 concoct/bins 41.fa 100.0 0.46 99.08 4765466 82084 101 +91 concoct/bins 7.fa 92.76 0.34 92.08 2274951 12187 265 +76 concoct/bins 6.fa 92.63 3.42 85.78999999999999 3751950 5674 855 +65 concoct/bins 62.fa 87.35 1.8 83.75 1917859 10911 259 +109 concoct/bins 
31.fa 100.0 14.25 71.5 7431952 25567 536 +75 concoct/bins 48.fa 73.35 4.26 64.83 3285374 2950 1261 +78 concoct/bins 2.fa 69.37 13.16 43.050000000000004 2385110 1471 1615 +113 concoct/bins 39.fa 46.66 3.76 39.14 809087 1511 527 +120 concoct/bins 29.fa 35.92 0.57 34.78 480789 1467 325 +86 concoct/bins 51.fa 45.9 7.36 31.18 4821129 1550 3079 +117 concoct/bins 20.fa 32.76 1.09 30.58 717576 1577 464 +118 concoct/bins 11.fa 47.24 9.31 28.62 1944164 1431 1340 +83 concoct/bins 26.fa 99.96 37.25 25.459999999999994 3984942 5809 809 +104 concoct/bins 38.fa 25.03 1.18 22.67 445168 1362 322 +119 concoct/bins 19.fa 35.1 7.86 19.380000000000003 2403536 1599 1464 +82 concoct/bins 59.fa 100.0 40.63 18.739999999999995 8287537 3951 2241 +89 concoct/bins 3.fa 87.66 34.5 18.659999999999997 6341799 1568 4092 +121 concoct/bins 30.fa 87.47 34.51 18.450000000000003 3480539 4299 901 +102 concoct/bins 0.fa 17.11 0.07 16.97 344299 1699 211 +97 concoct/bins 43.fa 12.5 0.15 12.2 345166 1384 238 +110 concoct/bins 35.fa 8.66 0.01 8.64 483789 1273 355 +108 concoct/bins 52.fa 6.65 0.0 6.65 89878 2232 40 +63 concoct/bins 27.fa 6.62 0.0 6.62 1016 1016 1 +101 concoct/bins 24.fa 6.45 0.0 6.45 3381 1107 3 +124 concoct/bins 18.fa 6.48 0.02 6.44 193358 1267 148 +106 concoct/bins 36.fa 6.43 0.0 6.43 12090 1997 7 +123 concoct/bins 14.fa 6.38 0.0 6.38 4193 3113 2 +114 concoct/bins 60.fa 6.38 0.0 6.38 8476 2499 5 +93 concoct/bins 5.fa 6.38 0.0 6.38 5082 1686 3 +87 concoct/bins 28.fa 6.38 0.0 6.38 5015 1668 3 +80 concoct/bins 13.fa 6.38 0.0 6.38 5338 1601 3 +112 concoct/bins 50.fa 6.38 0.0 6.38 13671 1587 9 +96 concoct/bins 54.fa 6.38 0.0 6.38 2727 1576 2 +107 concoct/bins 58.fa 6.38 0.0 6.38 1491 1491 1 +74 concoct/bins 45.fa 6.38 0.0 6.38 2475 1448 2 +70 concoct/bins 22.fa 6.38 0.0 6.38 1344 1344 1 +116 concoct/bins 10.fa 6.38 0.0 6.38 2524 1332 2 +98 concoct/bins 25.fa 6.38 0.0 6.38 10545 1304 8 +71 concoct/bins 32.fa 6.38 0.0 6.38 2290 1266 2 +92 concoct/bins 57.fa 6.38 0.0 6.38 4999 1246 4 +105 
concoct/bins 34.fa 6.38 0.0 6.38 1240 1240 1 +66 concoct/bins 23.fa 6.38 0.0 6.38 1236 1236 1 +72 concoct/bins 1.fa 6.38 0.0 6.38 12304 1223 10 +88 concoct/bins 53.fa 6.38 0.0 6.38 1160 1160 1 +68 concoct/bins 4.fa 6.38 0.0 6.38 6739 1136 6 +64 concoct/bins 37.fa 6.38 0.0 6.38 1123 1123 1 +94 concoct/bins 33.fa 6.38 0.0 6.38 1032 1032 1 +69 concoct/bins 17.fa 6.05 0.0 6.05 8012 1402 6 +85 concoct/bins 55.fa 5.85 0.0 5.85 117297 100818 12 +79 concoct/bins 40.fa 5.58 0.0 5.58 16429 1658 10 +99 concoct/bins 8.fa 5.35 0.0 5.35 98557 1192 80 +122 concoct/bins 61.fa 10.6 3.22 4.159999999999999 173292 1225 136 +77 concoct/bins 42.fa 3.74 0.0 3.74 122021 3383 50 +90 concoct/bins 15.fa 3.68 0.01 3.66 106174 24244 6 +111 concoct/bins 49.fa 3.6 0.02 3.56 75967 2458 39 +84 concoct/bins 46.fa 3.32 0.0 3.32 55857 1166 47 +115 concoct/bins 16.fa 3.21 0.0 3.21 36685 1138 31 +100 concoct/bins 21.fa 2.98 0.01 2.96 20489 1588 12 +95 concoct/bins 56.fa 2.73 0.0 2.73 28603 1276 21 +73 concoct/bins 47.fa 2.67 0.0 2.67 48903 2372 23 +103 concoct/bins 12.fa 2.53 0.0 2.53 41153 1182 34 +81 concoct/bins 44.fa 2.5 0.0 2.5 44603 1410 30 diff --git a/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_2.maxbin2.tsv b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_2.maxbin2.tsv new file mode 100644 index 0000000..d515b9d --- /dev/null +++ b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_2.maxbin2.tsv @@ -0,0 +1,24 @@ +bin_id origin name completeness contamination score size N50 contig_count +22 maxbin2 maxbin2.001.fasta 99.81 4.81 90.19 4616818 89436 133 +14 maxbin2 maxbin2.002.fasta 93.92 3.53 86.86 2874373 37523 195 +11 maxbin2 maxbin2.006.fasta 75.2 12.31 50.58 2634516 12326 626 +5 maxbin2 maxbin2.009.fasta 62.69 8.14 46.41 2438492 6141 604 +18 maxbin2 maxbin2.012.fasta 56.93 14.12 28.69 3473782 2619 1410 +7 maxbin2 maxbin2.011.fasta 40.68 7.03 26.619999999999997 2087133 6988 510 +23 maxbin2 maxbin2.010.fasta 45.34 9.62 
26.100000000000005 4743354 1971 2401 +3 maxbin2 maxbin2.018.fasta 80.35 27.53 25.289999999999992 5331237 4487 1756 +13 maxbin2 maxbin2.013.fasta 69.31 22.06 25.190000000000005 3958158 5259 1353 +21 maxbin2 maxbin2.007.fasta 34.6 4.79 25.020000000000003 1586278 12519 451 +6 maxbin2 maxbin2.021.fasta 42.81 9.69 23.430000000000003 1690737 2715 767 +19 maxbin2 maxbin2.020.fasta 27.99 2.4 23.189999999999998 1033153 3328 310 +20 maxbin2 maxbin2.014.fasta 26.95 2.05 22.85 1112378 1806 570 +10 maxbin2 maxbin2.008.fasta 56.41 17.23 21.949999999999996 3237421 2381 1425 +2 maxbin2 maxbin2.003.fasta 23.72 1.41 20.9 1419869 2539 575 +17 maxbin2 maxbin2.019.fasta 76.19 29.54 17.11 2765576 3328 1163 +1 maxbin2 maxbin2.023.fasta 27.26 6.52 14.220000000000002 454808 1432 314 +9 maxbin2 maxbin2.004.fasta 17.07 1.62 13.83 1180579 2361 491 +4 maxbin2 maxbin2.022.fasta 28.6 7.98 12.64 804525 1593 497 +8 maxbin2 maxbin2.005.fasta 11.11 0.04 11.03 488546 17602 45 +15 maxbin2 maxbin2.015.fasta 10.27 0.56 9.149999999999999 379048 3202 126 +16 maxbin2 maxbin2.016.fasta 4.92 0.0 4.92 103037 3558 49 +12 maxbin2 maxbin2.017.fasta 93.2 48.33 -3.4599999999999937 4710071 2372 2074 diff --git a/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_3.metabat2.tsv b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_3.metabat2.tsv new file mode 100644 index 0000000..fdc6bdd --- /dev/null +++ b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_3.metabat2.tsv @@ -0,0 +1,15 @@ +bin_id origin name completeness contamination score size N50 contig_count +36 metabat2 metabat2.14.fa 99.9 0.24 99.42 2799572 41151 99 +25 metabat2 metabat2.8.fa 93.17 0.22 92.73 2148097 12225 226 +33 metabat2 metabat2.12.fa 93.52 0.92 91.67999999999999 4266134 39217 157 +27 metabat2 metabat2.11.fa 84.4 1.53 81.34 1902761 11352 218 +37 metabat2 metabat2.1.fa 84.99 2.73 79.53 2980526 6876 502 +31 metabat2 metabat2.2.fa 83.21 3.16 76.88999999999999 1807028 7852 274 +35 metabat2 
metabat2.4.fa 76.53 0.11 76.31 3477636 82084 71 +29 metabat2 metabat2.7.fa 71.78 5.77 60.24 1384653 4937 292 +24 metabat2 metabat2.3.fa 51.75 2.99 45.769999999999996 1707078 4929 362 +30 metabat2 metabat2.13.fa 44.85 0.49 43.870000000000005 1724699 4259 415 +26 metabat2 metabat2.10.fa 44.15 1.11 41.93 982239 4743 219 +32 metabat2 metabat2.5.fa 25.31 0.03 25.25 1077467 91995 14 +28 metabat2 metabat2.9.fa 98.03 37.1 23.83 8543557 4347 1974 +34 metabat2 metabat2.6.fa 7.06 0.03 7.0 252404 64012 6 diff --git a/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_4.semibin2_output_bins.tsv b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_4.semibin2_output_bins.tsv new file mode 100644 index 0000000..c3a150b --- /dev/null +++ b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_4.semibin2_output_bins.tsv @@ -0,0 +1,26 @@ +bin_id origin name completeness contamination score size N50 contig_count +44 semibin2/output_bins SemiBin_27.fa.gz 100.0 0.09 99.82 4681369 82084 94 +53 semibin2/output_bins SemiBin_33.fa.gz 99.92 0.28 99.36 2937678 37523 113 +50 semibin2/output_bins SemiBin_10.fa.gz 93.43 0.14 93.15 2129295 12519 216 +62 semibin2/output_bins SemiBin_24.fa.gz 92.13 0.03 92.07 4162911 40395 139 +38 semibin2/output_bins SemiBin_26.fa.gz 83.09 2.25 78.59 1674156 8389 245 +49 semibin2/output_bins SemiBin_32.fa.gz 81.87 1.66 78.55000000000001 1820073 11737 205 +60 semibin2/output_bins SemiBin_22.fa.gz 80.25 1.63 76.99 2790948 7117 450 +47 semibin2/output_bins SemiBin_11.fa.gz 72.57 2.45 67.66999999999999 1245031 5061 253 +61 semibin2/output_bins SemiBin_3.fa.gz 53.34 1.33 50.68000000000001 1728690 4913 367 +57 semibin2/output_bins SemiBin_12.fa.gz 51.92 1.31 49.300000000000004 2609451 5292 511 +56 semibin2/output_bins SemiBin_17.fa.gz 47.29 0.37 46.55 1934420 4160 470 +42 semibin2/output_bins SemiBin_14.fa.gz 47.28 0.73 45.82 990463 4692 222 +51 semibin2/output_bins SemiBin_13.fa.gz 36.67 6.12 24.43 1699695 4402 395 +54 
semibin2/output_bins SemiBin_18.fa.gz 17.07 0.69 15.690000000000001 1131272 3943 277 +59 semibin2/output_bins SemiBin_15.fa.gz 14.04 1.01 12.02 884790 4349 206 +45 semibin2/output_bins SemiBin_20.fa.gz 9.95 0.01 9.93 515894 8389 67 +43 semibin2/output_bins SemiBin_5.fa.gz 9.95 0.05 9.85 513202 3891 131 +39 semibin2/output_bins SemiBin_35.fa.gz 9.45 0.0 9.45 213606 3336 63 +58 semibin2/output_bins SemiBin_84.fa.gz 8.7 0.0 8.7 358311 64012 9 +55 semibin2/output_bins SemiBin_66.fa.gz 8.66 0.19 8.28 290297 6707 44 +48 semibin2/output_bins SemiBin_52.fa.gz 8.28 0.01 8.26 358822 3296 106 +41 semibin2/output_bins SemiBin_19.fa.gz 8.12 0.02 8.08 353499 3949 90 +52 semibin2/output_bins SemiBin_6.fa.gz 7.74 0.01 7.720000000000001 351540 4284 85 +46 semibin2/output_bins SemiBin_37.fa.gz 6.18 0.0 6.18 250833 3607 66 +40 semibin2/output_bins SemiBin_80.fa.gz 4.41 0.13 4.15 217541 3425 64 diff --git a/docs/tutorial/binning.md b/docs/tutorial/binning.md index 8624cf6..bace377 100644 --- a/docs/tutorial/binning.md +++ b/docs/tutorial/binning.md @@ -1,10 +1,50 @@ + +## Align the Reads to the Assembly + +Binning tools rely on coverage information, among other criteria, to evaluate each contig. + +To obtain this coverage data, we first need to map the reads back to the assembly. + +```{code-block} bash +# Create a directory for the alignments +mkdir -p alignments_bwa/ + +# Index the contigs file using BWA-MEM2 +bwa-mem2 index Kickstart.megahit/R1.contigs.fa -p Kickstart.megahit/R1.contigs.fa + +# Map reads back to the assembly, convert to BAM format, and sort +bwa-mem2 mem -t 12 Kickstart.megahit/R1.contigs.fa coal-metagenomics/Kickstart_*.fastq.gz | \ +samtools view -@ 12 -bS - | \ +samtools sort -@ 12 - -o alignments_bwa/Kickstart.bam + +# Index the BAM file +samtools index alignments_bwa/Kickstart.bam +``` + + +:::{admonition} ⌛ Expected Time +:class: note + +This process takes approximately 12 minutes to complete. 
+::: + +```{admonition} +:class: tip + +If you have multiple samples and assemble them separately, cross-aligning the samples can significantly improve binning. Align each sample to all assemblies and use the resulting BAM files in binning. This approach gives the binning tools more coverage variation, which can be beneficial. However, keep in mind that this process can be resource-intensive, especially with many samples. + +If you did a cross-assembly with your samples, make sure to map the reads separately for each one, generating as many BAM files as you have samples, to help the binning tool. 🚀 + +``` + + ## Run Binning Tools -In this section, we'll use different binning tools to group contigs of assembly. +Let's use different binning tools to group the contigs into bins, which we'll refine in the next section with Binette. ### MetaBAT2 -First, generate a depth file from the BAM file using the `jgi_summarize_bam_contig_depths` script from MetaBAT2. This depth file will also be used for MaxBin2. +First, generate a depth file from the BAM file using the `jgi_summarize_bam_contig_depths` script from MetaBAT2. This depth file will also be used by MaxBin2. ```bash jgi_summarize_bam_contig_depths --outputDepth depth_Kickstart.txt alignments_bwa/Kickstart.bam diff --git a/docs/tutorial/tutorial_main.md b/docs/tutorial/tutorial_main.md index d21931c..4493b3e 100644 --- a/docs/tutorial/tutorial_main.md +++ b/docs/tutorial/tutorial_main.md @@ -1,59 +1,30 @@ # Tutorial -In this tutorial, we'll walk through a practical example of how to use Binette with real data. We'll start by downloading metagenomics reads and then assemble these reads into contigs. Next, we'll use different binning tools to group the contigs. Finally, we'll use Binette to refine these bins and improve our results. +In this tutorial, we'll walk through a practical example of how to use Binette with real data. We'll start by downloading metagenomics reads and then assemble these reads into contigs. 
Next, we'll use different binning tools to group the contigs into bins. Finally, we'll use Binette to refine these bins. ```{mermaid} ---- -title: "Tutorial Overview:" -align: center ---- - -%%{init: {'theme':'default'}}%% - -graph LR - - A[Download Metagenomics Reads] --> B - B[Assemble Reads into Contigs] --> c - subgraph Pangenome creation - a:::workflow - c:::workflow - g:::workflow - p:::workflow - a("annotate") --> c - c(cluster) --> g(graph) - g(graph) --> p(partition) - end - - - C[Bin Contigs with Binning Tools] --> D[Refine Bins with Binette] - - - classDef panrgp fill:#4066d4 - classDef panmodule fill:#d44066 - classDef workflow fill:#d4ae40 - - -``` - -```{mermaid} - ---- +--- title: "Tutorial Overview:" align: center + +config: + look: handDrawn + theme: neutral --- graph TD - i[Get Metagenomics Reads] --> B[Assembly & Reads alignment] + i[metagenomics reads] --> B[assembly] - B --> metabat2 --> r[Binette] + B --> metabat2 --> r[binette] B --> maxbin2 --> r B --> concoct --> r B --> semibin2 --> r + r --> f[final bins] subgraph Binning metabat2:::binning @@ -62,8 +33,8 @@ graph TD semibin2:::binning end - - classDef binning fill:#d4ae40 + + classDef binning fill:#d4ae40 ``` @@ -79,6 +50,7 @@ assembly binning binette analyse_binette_result.ipynb +analyse_binette_result.myst ``` From 25443d416cba4ef2d66f985c8de5793de46e8caa Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 13:57:37 +0200 Subject: [PATCH 09/20] improve doc --- docs/api/api_ref.md | 1 - docs/api/binette.md | 13 +--- docs/api/modules.md | 7 -- docs/conf.py | 4 +- docs/contributing.md | 2 +- docs/tests.md | 2 +- docs/tutorial/assembly.md | 31 ++++++--- docs/tutorial/binette.md | 2 +- docs/tutorial/binette_tutorial_env.yaml | 21 ++++++ docs/tutorial/binning.md | 51 ++------------ docs/tutorial/get_dataset.md | 88 +++++++++++++++++++++++++ docs/tutorial/set_environment.md | 12 ++-- docs/tutorial/tutorial_main.md | 21 +++--- docs/usage.md | 4 +- 14 files changed, 162 
insertions(+), 97 deletions(-) delete mode 100644 docs/api/modules.md create mode 100644 docs/tutorial/binette_tutorial_env.yaml create mode 100644 docs/tutorial/get_dataset.md diff --git a/docs/api/api_ref.md b/docs/api/api_ref.md index 3de18e1..58bea10 100644 --- a/docs/api/api_ref.md +++ b/docs/api/api_ref.md @@ -3,6 +3,5 @@ ```{toctree} :maxdepth: 2 binette -indice_and_table ``` diff --git a/docs/api/binette.md b/docs/api/binette.md index bc3c754..a4e2c9a 100644 --- a/docs/api/binette.md +++ b/docs/api/binette.md @@ -20,15 +20,6 @@ :show-inheritance: ``` -## binette.binette module - -```{eval-rst} -.. automodule:: binette.binette - :members: - :undoc-members: - :show-inheritance: -``` - ## binette.cds module ```{eval-rst} @@ -65,10 +56,10 @@ :show-inheritance: ``` -## Module contents +## binette.main module ```{eval-rst} -.. automodule:: binette +.. automodule:: binette.main :members: :undoc-members: :show-inheritance: diff --git a/docs/api/modules.md b/docs/api/modules.md deleted file mode 100644 index b83d27c..0000000 --- a/docs/api/modules.md +++ /dev/null @@ -1,7 +0,0 @@ -# binette - -```{toctree} -:maxdepth: 4 - -binette -``` diff --git a/docs/conf.py b/docs/conf.py index 245781e..0cce3b6 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,13 +24,13 @@ "sphinx.ext.autosectionlabel", "sphinx.ext.autodoc", 'sphinx_search.extension', + 'sphinx_togglebutton', # "myst_nb", "myst_parser", 'nbsphinx', 'nbsphinx_link', # 'sphinx.ext.napoleon', # 'sphinx.ext.viewcode', - "myst_parser", 'sphinxcontrib.mermaid' ] myst_enable_extensions = [ @@ -61,7 +61,7 @@ # `path/to/file:heading` instead of just `heading` autosectionlabel_prefix_document = True -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'build', "api", "jupyter_execute"] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'build', "jupyter_execute"] diff --git a/docs/contributing.md b/docs/contributing.md index f400a68..e9b69ee 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -26,7 
+26,7 @@ For minor changes like fixing typos or making small edits, create a new Pull Req - Clone your forked repository to your local machine. 2. **Get an Environment:** - Create an environment with all Binette prerequisites installed by following the installation instructions [here](./installation.md#installing-from-source-code-within-a-conda-environnement). + Create an environment with all Binette prerequisites installed by following the installation instructions [here](./installation.md#from-the-source-code-within-a-conda-environnement). 3. **Install in Editable Mode:** To enable code editing and testing of new functionality, you can install Binette in editable mode using the following command: diff --git a/docs/tests.md index b6c0e60..ed1cdd7 100644 --- a/docs/tests.md +++ b/docs/tests.md @@ -8,7 +8,7 @@ Tests have been implemented to ensure the correctness of Binette. Unit tests have been implmented in the tests directory using pytest. -To run the test suit you would need to have install Binette from the source code. For that, you can follow installation instructions [here](./installation.md#installing-from-source-code-within-a-conda-environnement). +To run the test suite you would need to have installed Binette from the source code. For that, you can follow installation instructions [here](./installation.md#from-the-source-code-within-a-conda-environnement). To install pytest in you environement you can run : diff --git a/docs/tutorial/assembly.md index d2084bb..f919f00 100644 --- a/docs/tutorial/assembly.md +++ b/docs/tutorial/assembly.md @@ -10,39 +10,40 @@ megahit -1 coal-metagenomics/Kickstart_1.fastq.gz \ :::{admonition} ⌛ Expected Time :class: note +:class: dropdown This process takes approximately 28 minutes to complete. ::: -```{admonition} Note -:class: note - -You can also use **SPAdes** for assembly. It generally performs better than MEGAHIT but takes longer and requires more memory. 
-``` - -```{admonition} Best Practices +```{admonition} Assembly tips :class: tip +:class: dropdown Here are some general tips that might help improve your assembly results, depending on your data: - **Read Cleaning:** If your reads have low-quality bases or adapters, consider cleaning them with a tool like `sickle`. It can boost the overall quality of your assembly. +- **Use SPAdes rather than MEGAHIT** **SPAdes** generally performs better than MEGAHIT but takes longer and requires more memory. + - **Quality Check:** Tools like `metaQUAST` are handy for checking your assembly’s quality. It’s a good way to ensure your results are solid before moving on. - **Assembly Filtering:** After assembling, it’s often a good idea to filter out small or low-coverage contigs. -These steps aren’t mandatory, and since this tutorial focuses on binning and using Binette, we’ll skip them for now. +These steps aren’t mandatory, and since this tutorial focuses on binning refinement with Binette, we’ll skip them. ``` -## Align the Reads Over the Assembly -To get coverage information, we first need to map the reads back to the assembly. +## Align the Reads to the Assembly + +Binning tools rely on coverage information, among other criteria, to evaluate each contig. + +To obtain this coverage data, we first need to map the reads back to the assembly. ```{code-block} bash # Create a directory for the alignments @@ -63,6 +64,16 @@ samtools index alignments_bwa/Kickstart.bam :::{admonition} ⌛ Expected Time :class: note +:class: dropdown This process takes approximately 12 minutes to complete. ::: + +```{admonition} Read alignment strategy +:class: tip + +If you have multiple samples and assemble them separately, cross-aligning the samples can significantly improve binning. Align each sample to all assemblies and use the resulting BAM files in binning. This approach gives the binning tools more coverage variation, which can be beneficial. 
However, keep in mind that this process can be resource-intensive, especially with many samples. + +If you did a cross-assembly with your samples, make sure to map the reads separately for each one, generating as many BAM files as you have samples, to help the binning tool. 🚀 + +``` \ No newline at end of file diff --git a/docs/tutorial/binette.md b/docs/tutorial/binette.md index 926ce3b..a2d7796 100644 --- a/docs/tutorial/binette.md +++ b/docs/tutorial/binette.md @@ -13,7 +13,7 @@ binette --bin_dirs maxbin2/ metabat2/ semibin2/output_bins/ concoct/bins/ \ Once Binette completes, the `binette_results` directory should have the following structure: -```plaintext +``` binette_results/ ├── final_bins │ ├── bin_13475.fa diff --git a/docs/tutorial/binette_tutorial_env.yaml b/docs/tutorial/binette_tutorial_env.yaml new file mode 100644 index 0000000..72655d3 --- /dev/null +++ b/docs/tutorial/binette_tutorial_env.yaml @@ -0,0 +1,21 @@ +name: binette_tutorial +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - uscdc-datasets-sars-cov-2 # Dataset downloader to get the tutorial initial data + - fastqc # Quality control for high-throughput sequencing data + - samtools=1 # Tools for manipulating sequencing data in SAM format + - bedtools=2 # Suite of tools for genome arithmetic + - bwa-mem2=2 # Align reads to a reference genome (using BWA-MEM2) + - megahit=1 # De novo assembler for large genomes + - maxbin2=2 # Binning tool for metagenomic datasets + - metabat2=2 # Binning tool for metagenomic datasets + - semibin=2 # Binning tool for metagenomic datasets + - concoct=1 # Binning tool for metagenomic datasets + - binette=1.0.1 # Binette for binning and genome analysis + - das_tool=1 # Bin refiner to compare with Binette + - jupyter # Jupyter notebook for interactive analysis + - pandas=1 # Data manipulation and analysis + - plotly=5 # Interactive graphing \ No newline at end of file diff --git a/docs/tutorial/binning.md b/docs/tutorial/binning.md index 
bace377..cb82166 100644 --- a/docs/tutorial/binning.md +++ b/docs/tutorial/binning.md @@ -1,43 +1,3 @@ - -## Align the Reads to the Assembly - -Binning tools rely on coverage information, among other criteria, to evaluate each contig. - -To obtain this coverage data, we first need to map the reads back to the assembly. - -```{code-block} bash -# Create a directory for the alignments -mkdir -p alignments_bwa/ - -# Index the contigs file using BWA-MEM2 -bwa-mem2 index Kickstart.megahit/R1.contigs.fa -p Kickstart.megahit/R1.contigs.fa - -# Map reads back to the assembly, convert to BAM format, and sort -bwa-mem2 mem -t 12 Kickstart.megahit/R1.contigs.fa coal-metagenomics/Kickstart_*.fastq.gz | \ -samtools view -@ 12 -bS - | \ -samtools sort -@ 12 - -o alignments_bwa/Kickstart.bam - -# Index the BAM file -samtools index alignments_bwa/Kickstart.bam -``` - - -:::{admonition} ⌛ Expected Time -:class: note - -This process takes approximately 12 minutes to complete. -::: - -```{admonition} -:class: tip - -If you have multiple samples and assemble them separately, cross-aligning the samples can significantly improve binning. Align each sample to all assemblies and use the resulting BAM files in binning. This approach gives the binning tools more coverage variation, which can be beneficial. However, keep in mind that this process can be resource-intensive, especially with many samples. - -If you did a cross-assembly with your samples, make sure to map the reads separately for each one, generating as many BAM files as you have samples, to help the binning tool. 🚀 - -``` - - ## Run Binning Tools Let's use different binning tools to group the contigs into bins, which we'll refine in the next section with Binette. @@ -108,14 +68,15 @@ extract_fasta_bins.py Kickstart.megahit/R1.contigs.fa concoct/clustering_merge.c You can also run SemiBin2 with its `single_easy_bin` command: -```{admonition} ⏳ Time Note -:class: note - -This process can take some time, so it may be skipped. 
-``` ```bash SemiBin2 single_easy_bin -i Kickstart.megahit/R1.contigs.fa \ -b alignments_bwa/Kickstart.bam \ -o semibin2/ -p 12 ``` + +```{admonition} ⏳ Time Note +:class: note + +This process can take some time. +``` diff --git a/docs/tutorial/get_dataset.md b/docs/tutorial/get_dataset.md new file mode 100644 index 0000000..3967bc3 --- /dev/null +++ b/docs/tutorial/get_dataset.md @@ -0,0 +1,88 @@ +## Obtaining Metagenomic Data for the Tutorial + +### Using the ncezid-biome Datasets Tool + +For this tutorial, we’ll use the "Kickstart" metagenome dataset from the [ncezid-biome datasets GitHub repository](https://github.com/ncezid-biome/). This dataset corresponds to sample [SAMN05024035](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRR5058924&o=acc_s%3Aa) and SRA [SRR5058924](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRR5058924&o=acc_s%3Aa). + + +We'll download the "Kickstart" dataset using the ncezid-biome datasets tool. You can find the tool and instructions on how to use it in their [GitHub repository](https://github.com/ncezid-biome/datasets?tab=readme-ov-file#edlb). + +The tool called `uscdc-datasets-sars-cov-2` on bioconda is part of the Conda environment created in the [previous section](./set_environment.md). + + +#### Download the Kickstart Dataset + +Once the tool is installed, you can download the "Kickstart" dataset with the following steps: + +1. **Download the coal-metagenomics table** from the GitHub repository: + + ```{code-block} bash + wget https://raw.githubusercontent.com/ncezid-biome/datasets/master/datasets/coal-metagenomics.tsv + ``` + +2. 
**Select the relevant line** corresponding to the "Kickstart" dataset (SRR5058924) by extracting the header and the specific entry: + + ```{code-block} bash + # Select the header of the table + head -n7 coal-metagenomics.tsv > coal-metagenomics_Kickstart_only.tsv + + # Append the relevant line for the Kickstart dataset + grep SRR5058924 coal-metagenomics.tsv >> coal-metagenomics_Kickstart_only.tsv + ``` + +3. **Run the dataset download** using the `GenFSGopher.pl` script: + + ```{code-block} bash + GenFSGopher.pl --numcpus 12 --compressed --outdir coal-metagenomics coal-metagenomics_Kickstart_only.tsv + ``` + + +:::{admonition} ⌛ Expected Time +:class: note + +This process takes approximately 16 minutes to complete. +::: + +#### Directory Structure + +After downloading, your directory structure should look like this: + +```{code-block} text +├── coal-metagenomics_Kickstart_only.tsv +└── data + ├── in.tsv + ├── Kickstart_1.fastq.gz + ├── Kickstart_1.fastq.sha256 + ├── Kickstart_2.fastq.gz + ├── Kickstart_2.fastq.sha256 + ├── Makefile + ├── prefetch.done + ├── sha256sum.log + ├── SRR5058924 + │   └── SRR5058924.sra + └── tree.dnd +``` + +In the next section, we will assemble the two read files to obtain an assembly of the dataset: +- `data/Kickstart_1.fastq.gz` +- `data/Kickstart_2.fastq.gz` + + +:::{admonition} 🧹 Cleaning Tip +:class: tip + +You can remove the SRA file `data/SRR5058924/SRR5058924.sra` as it is no longer needed; we will use only the FASTQ files. To remove it, run: + +```{code-block} bash +rm data/SRR5058924/SRR5058924.sra +::: + +```{note} +Alternatively, you can download the data using the SRA Toolkit, which is what the ncezid-biome tool uses in the background. +Note that the ncezid-biome tool provides additional checksum verification to ensure data integrity. 
+You can retrieve the data with the following commands after installing the SRA Toolkit (e.g., via Conda: [sra-tools on Anaconda](https://anaconda.org/bioconda/sra-tools)): +```{code-block} bash +prefetch SRR5058924 +fastq-dump --defline-seq '@$ac_$sn/$ri' --defline-qual '+' --split-3 -O . SRR5058924.sra +``` + diff --git a/docs/tutorial/set_environment.md b/docs/tutorial/set_environment.md index 52d6092..7f039fa 100644 --- a/docs/tutorial/set_environment.md +++ b/docs/tutorial/set_environment.md @@ -12,6 +12,13 @@ mamba env create -f binette_tutorial_env.yaml -n binette_tuto This command will create a Conda environment named `binette_tuto` using the environment file `binette_tutorial_env.yaml`. +Below is the content of the `binette_tutorial_env.yaml` file: + +```{include} binette_tutorial_env.yaml +:code: yaml +``` + + ### Activate the Environment After the environment is created, activate it by running: @@ -20,8 +27,3 @@ After the environment is created, activate it by running: conda activate binette_tuto ``` -Below is the content of the `binette_tutorial_env.yaml` file: - -```{include} binette_tutorial_env.yaml -:code: yaml -``` diff --git a/docs/tutorial/tutorial_main.md b/docs/tutorial/tutorial_main.md index 4493b3e..8cd8082 100644 --- a/docs/tutorial/tutorial_main.md +++ b/docs/tutorial/tutorial_main.md @@ -7,7 +7,7 @@ In this tutorial, we'll walk through a practical example of how to use Binette w --- title: "Tutorial Overview:" -align: center +align: right config: look: handDrawn @@ -17,20 +17,20 @@ config: graph TD - i[metagenomics reads] --> B[assembly] + i[Metagenomics reads] --> B[Assembly] - B --> metabat2 --> r[binette] - B --> maxbin2 --> r - B --> concoct --> r - B --> semibin2 --> r + B --> MetaBAT2 --> r[Binette] + B --> MaxBin2 --> r + B --> CONCOCT --> r + B --> SemiBin2 --> r r --> f[final bins] subgraph Binning - metabat2:::binning - maxbin2:::binning - concoct:::binning - semibin2:::binning + MetaBAT2:::binning + MaxBin2:::binning + 
CONCOCT:::binning + SemiBin2:::binning end @@ -50,7 +50,6 @@ assembly binning binette analyse_binette_result.ipynb -analyse_binette_result.myst ``` diff --git a/docs/usage.md b/docs/usage.md index 2108a50..063a51b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -11,7 +11,7 @@ For example, consider the following two `contig2bin_tables`: - `bin_set1.tsv`: - ```tsv + ``` contig_1 binA contig_8 binA contig_15 binB @@ -20,7 +20,7 @@ For example, consider the following two `contig2bin_tables`: - `bin_set2.tsv`: - ```tsv + ``` contig_1 bin.0 contig_8 bin.0 contig_15 bin.1 From 716e2e55cacb964f50f1fba19f54625b78ca3e83 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 14:19:11 +0200 Subject: [PATCH 10/20] update usage with input bin sets report new output --- docs/usage.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/usage.md b/docs/usage.md index 063a51b..9b37065 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -65,6 +65,7 @@ Binette results are stored in the `results` directory. You can specify a differe In this directory you will find: - `final_bins_quality_reports.tsv`: This is a TSV (tab-separated values) file containing quality information about the final selected bins. - `final_bins/`: This directory stores all the selected bins in fasta format. +- `input_bins_quality_reports/`: A directory storing quality reports for the input bin sets, with files following the same structure as `final_bins_quality_reports.tsv`. - `temporary_files/`: This directory contains intermediate files. If you choose to use the `--resume` option, Binette will utilize files in this directory to prevent the recomputation of time-consuming steps. 
From 3344a4548e41312eed143ed3d32d92bdf3c4e139 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 14:20:32 +0200 Subject: [PATCH 11/20] add missing sphinx_togglebutton extension --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8f12e86..3f99085 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,8 @@ doc = [ "nbsphinx==0.9.5", "nbsphinx_link==1.3.0", "sphinx-book-theme==1.0.1", - "sphinxcontrib.mermaid" + "sphinxcontrib.mermaid", + "sphinx_togglebutton=0.3.2" ] dev = [ From eeee06e6e7678d91c7ef450863c209a4932317dc Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 14:27:16 +0200 Subject: [PATCH 12/20] fix pip format for added ext --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3f99085..2769cd9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ doc = [ "nbsphinx_link==1.3.0", "sphinx-book-theme==1.0.1", "sphinxcontrib.mermaid", - "sphinx_togglebutton=0.3.2" + "sphinx_togglebutton==0.3.2" ] dev = [ From 05696e8c8faaaa211a9269f230afd854296bcbfe Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 15:04:50 +0200 Subject: [PATCH 13/20] improve doc --- docs/tutorial/assembly.md | 1 - docs/tutorial/binette.md | 8 ++++++++ docs/tutorial/binning.md | 5 +++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/docs/tutorial/assembly.md b/docs/tutorial/assembly.md index f919f00..60d25fa 100644 --- a/docs/tutorial/assembly.md +++ b/docs/tutorial/assembly.md @@ -19,7 +19,6 @@ This process takes approximately 28 minutes to complete. 
```{admonition} Assembly tips :class: tip -:class: dropdown Here are some general tips that might help improve your assembly results, depending on your data: diff --git a/docs/tutorial/binette.md index a2d7796..5f83676 100644 --- a/docs/tutorial/binette.md +++ b/docs/tutorial/binette.md @@ -11,6 +11,14 @@ binette --bin_dirs maxbin2/ metabat2/ semibin2/output_bins/ concoct/bins/ \ --verbose -t 12 -o binette_results ``` +```{admonition} ⌛ Expected Time +:class: note +:class: dropdown + +This process should take around 9 minutes to complete. +``` + + Once Binette completes, the `binette_results` directory should have the following structure: ``` diff --git a/docs/tutorial/binning.md index cb82166..b495db1 100644 --- a/docs/tutorial/binning.md +++ b/docs/tutorial/binning.md @@ -75,8 +75,9 @@ SemiBin2 single_easy_bin -i Kickstart.megahit/R1.contigs.fa \ -o semibin2/ -p 12 ``` -```{admonition} ⏳ Time Note +```{admonition} ⌛ Expected Time :class: note +:class: dropdown -This process can take some time. +This process takes around 1 hour to complete. 
``` From 8a31ecb840ecc3bececc6d05ee3326759616a257 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 15:05:44 +0200 Subject: [PATCH 14/20] add environement.yml for binder --- environment.yml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 environment.yml diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..dd03bfa --- /dev/null +++ b/environment.yml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - defaults +dependencies: + - jupyter # Jupyter notebook for interactive analysis + - pandas=1 # Data manipulation and analysis + - plotly=5 # Interactive graphing + - nbgitpuller \ No newline at end of file From 46cd347cfd565d240aecd6731be33e8ac494da24 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 15:21:13 +0200 Subject: [PATCH 15/20] fix typo in assembly tuto doc --- docs/tutorial/assembly.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorial/assembly.md b/docs/tutorial/assembly.md index 60d25fa..fdd1003 100644 --- a/docs/tutorial/assembly.md +++ b/docs/tutorial/assembly.md @@ -24,7 +24,7 @@ Here are some general tips that might help improve your assembly results, depend - **Read Cleaning:** If your reads have low-quality bases or adapters, consider cleaning them with a tool like `sickle`. It can boost the overall quality of your assembly. -- **Use SPAdes rather than MEGAHIT** **SPAdes** generally performs better than MEGAHIT but takes longer and requires more memory. +- **Use SPAdes rather than MEGAHIT:** SPAdes generally performs better than MEGAHIT but takes longer and requires more memory. - **Quality Check:** Tools like `metaQUAST` are handy for checking your assembly’s quality. It’s a good way to ensure your results are solid before moving on. 
From 2e5b7b04de8c1011cdd6e5cd1fe7eb8e20175523 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 15:21:39 +0200 Subject: [PATCH 16/20] use proper name for env --- docs/tutorial/set_environment.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorial/set_environment.md b/docs/tutorial/set_environment.md index 7f039fa..5d003dc 100644 --- a/docs/tutorial/set_environment.md +++ b/docs/tutorial/set_environment.md @@ -7,7 +7,7 @@ To get started, we'll download the necessary tools and set them up in a dedicate First, let's create a new Conda environment specifically for this tutorial: ```{code-block} bash -mamba env create -f binette_tutorial_env.yaml -n binette_tuto +mamba env create -f binette_tutorial_env.yaml -n binette_tutorial ``` This command will create a Conda environment named `binette_tuto` using the environment file `binette_tutorial_env.yaml`. From 91c34528d14907d9d3ab4ec835b91ecf571fa5c0 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 16:03:11 +0200 Subject: [PATCH 17/20] improve env file --- environment.yml => binder/environment.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename environment.yml => binder/environment.yml (61%) diff --git a/environment.yml b/binder/environment.yml similarity index 61% rename from environment.yml rename to binder/environment.yml index dd03bfa..4f9610f 100644 --- a/environment.yml +++ b/binder/environment.yml @@ -1,8 +1,8 @@ +name: binder_tutorial_env channels: - conda-forge - - defaults dependencies: - jupyter # Jupyter notebook for interactive analysis - - pandas=1 # Data manipulation and analysis + - pandas # Data manipulation and analysis - plotly=5 # Interactive graphing - - nbgitpuller \ No newline at end of file + # - nbgitpuller \ No newline at end of file From b5bd70b705f77d91705f5375a5dbb9eb7a1765db Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 17:04:49 +0200 Subject: [PATCH 18/20] try binder build from requirements.txt --- 
binder/environment.yml | 8 -------- binder/requirements.txt | 3 +++ 2 files changed, 3 insertions(+), 8 deletions(-) delete mode 100644 binder/environment.yml create mode 100644 binder/requirements.txt diff --git a/binder/environment.yml b/binder/environment.yml deleted file mode 100644 index 4f9610f..0000000 --- a/binder/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ -name: binder_tutorial_env -channels: - - conda-forge -dependencies: - - jupyter # Jupyter notebook for interactive analysis - - pandas # Data manipulation and analysis - - plotly=5 # Interactive graphing - # - nbgitpuller \ No newline at end of file diff --git a/binder/requirements.txt b/binder/requirements.txt new file mode 100644 index 0000000..8b0c82c --- /dev/null +++ b/binder/requirements.txt @@ -0,0 +1,3 @@ +jupyter # Jupyter notebook for interactive analysis +pandas # Data manipulation and analysis +plotly # Interactive graphing From 8719f3c1b4bb621ee502ef6c0f8870a5b3526cbc Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 17:17:18 +0200 Subject: [PATCH 19/20] improve tutorial --- docs/tutorial/get_dataset.md | 8 ++++---- docs/tutorial/tutorial_main.md | 6 +++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/tutorial/get_dataset.md b/docs/tutorial/get_dataset.md index 3967bc3..f099b17 100644 --- a/docs/tutorial/get_dataset.md +++ b/docs/tutorial/get_dataset.md @@ -37,11 +37,11 @@ Once the tool is installed, you can download the "Kickstart" dataset with the fo ``` -:::{admonition} ⌛ Expected Time -:class: note + :::{admonition} ⌛ Expected Time + :class: note -This process takes approximately 16 minutes to complete. -::: + This process takes approximately 16 minutes to complete. 
+ ::: #### Directory Structure diff --git a/docs/tutorial/tutorial_main.md b/docs/tutorial/tutorial_main.md index 8cd8082..1fdc365 100644 --- a/docs/tutorial/tutorial_main.md +++ b/docs/tutorial/tutorial_main.md @@ -1,7 +1,11 @@ # Tutorial -In this tutorial, we'll walk through a practical example of how to use Binette with real data. We'll start by downloading metagenomics reads and then assemble these reads into contigs. Next, we'll use different binning tools to group the contigs into bins. Finally, we'll use Binette to refine these bins. +In this tutorial, we'll walk through a practical example of how to use Binette with real data. + + 1. We'll start by downloading metagenomics reads and then assemble these reads into contigs. + 2. Next, we'll use different binning tools to group the contigs into bins. + 3. Finally, we'll use Binette to refine these bins. ```{mermaid} From 8aec08230b9a1921b588ad7ce4d297cc146ecd1f Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 17:18:40 +0200 Subject: [PATCH 20/20] remvoe binder env as it does not work --- binder/requirements.txt | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 binder/requirements.txt diff --git a/binder/requirements.txt b/binder/requirements.txt deleted file mode 100644 index 8b0c82c..0000000 --- a/binder/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -jupyter # Jupyter notebook for interactive analysis -pandas # Data manipulation and analysis -plotly # Interactive graphing
Contamination ≤ 10 and<br>Completenesstoolbin_count
0> 50% and ≤ 70%binette5
1> 50% and ≤ 70%maxbin21
2> 50% and ≤ 70%metabat21
3> 50% and ≤ 70%semibin22
4> 70% and ≤ 90%binette3
5> 70% and ≤ 90%concoct2
6