Adds code for version 4.0.2

biobakery · Sep 22, 2022 · b78e5b8 · b78e5b8
1 parent 40549a0
commit b78e5b8
Show file tree

Hide file tree

Showing 79 changed files with 7,755 additions and 154 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,34 @@
+## Version 4.0.2 (Sep 22nd, 2022)
+### New features
+* [MetaPhlAn] The new `--subsampling` parameter allows subsampling of the reads on the fly
+* [MetaPhlAn] The new `--subsampling_seed` parameter enables a deterministic or randomized subsampling of the reads
+* [MetaPhlAn] The new `--gtdb_profiles` parameter of the `merge_metaphlan_tables.py` script allows merging GTDB-based MetaPhlAn profiles
+* [StrainPhlAn] The new `--breadth_thres` parameter allows StrainPhlAn to filter the consensus marker sequences after the execution of `sample2markers.py`
+* [StrainPhlAn] Interactive selection of the available SGBs when the clade is specified at the species level
+* [StrainPhlAn] The new `--non_interactive` parameter disables user interaction when running StrainPhlAn 
+* [StrainPhlAn] The new `--abs_n_markers_thres` and `--abs_n_samples_thres` parameters enable the specification of the samples/markers filtering thresholds in absolute numbers
+* [StrainPhlAn] The new `--treeshrink` parameter enables StrainPhlAn to run TreeShrink for outlier removal in the tree 
+* [StrainPhlAn] Addition of the `VallesColomerM_2022_Jan21_thresholds.tsv` thresholds file for compatibility with the mpa_vJan21 database
+* [StrainPhlAn] The new `--clades` parameter enables `sample2markers.py` to restrict the reconstruction of markers to the specified clades
+
+### Changed features
+* [StrainPhlAn] The `-c` parameter of the `extract_markers.py` script now allows the specification of multiple clades
+* [StrainPhlAn] The `--print_clades_only` parameter now produces an output `print_clades_only.tsv` report
+* [StrainPhlAn] Compatibility with clade markers compressed in bz2 format
+* [StrainPhlAn] The `strain_transmission.py` script now uses by default the `VallesColomerM_2022_Jan21_thresholds.tsv` thresholds
+### Fixes
+* [MetaPhlAn] `metaphlan2krona.py` and `hclust2` have been added to the bioconda recipe
+
+<br/>
+
 ## Version 4.0.1 (Aug 25th, 2022)
 ### New features
-* The new `--offline` parameter stops MetaPhlAn from automatically checking for updates
+* [MetaPhlAn] The new `--offline` parameter stops MetaPhlAn from automatically checking for updates
 ### Changed features
-* Improved StrainPhlAn's gaps management with the newest version of PhyloPhlAn (version 3.0.3)
-* Improved set of colors for the `plot_tree_graphlan.py script`
+* [StrainPhlAn] Improved StrainPhlAn's gaps management with the newest version of PhyloPhlAn (version 3.0.3)
+* [StrainPhlAn] Improved set of colors for the `plot_tree_graphlan.py` script
 ### Fixes
-* Fixes `KeyError: 't'` error when running MetaPhlAn with the `--CAMI_format_output` parameter
+* [MetaPhlAn] Fixes `KeyError: 't'` error when running MetaPhlAn with the `--CAMI_format_output` parameter
 
 <br/>
 
@@ -16,9 +39,9 @@
 * Ability to profile 21,978 known (kSGBs) and 4,992 unknown (uSGBs) microbial species
 * Better representation of, not only the human gut microbiome but also many other animal and ecological environments
 ### New features
-* Compatibility with MetaPhlAn 3 databases with parameter `--mpa3`
+* [MetaPhlAn] Compatibility with MetaPhlAn 3 databases with parameter `--mpa3`
 ### Changed features
-* Estimation of metagenome composed by microbes not included in the database with parameter `--unclassified_estimation`
+* [MetaPhlAn] Estimation of metagenome composed by microbes not included in the database with parameter `--unclassified_estimation`
 
 <br/>
 
@@ -29,7 +52,7 @@
 * Marker genes for a subset of existing bioBakery 3 species were also revised
 * Most existing bioBakery 3 species pangenomes were updated with revised or expanded gene content
 ### Changed features
-* MetaPhlAn 3.1 software has been updated to work with revised marker database
+* [MetaPhlAn] MetaPhlAn 3.1 software has been updated to work with revised marker database
 
 <br/>
 

diff --git a/bioconda_recipe/meta.yaml b/bioconda_recipe/meta.yaml
@@ -1,9 +1,9 @@
 {% set name = "metaphlan" %}
-{% set version = "4.0" %}
+{% set version = "4.0.2" %}
 
 package:
-  name: {{ name }}
-  version: {{ version }}
+  name: metaphlan
+  version: 4.0.2
 
 source:
   # url: https://github.com/biobakery/MetaPhlAn/archive/{{version}}.tar.gz
@@ -40,19 +40,19 @@ requirements:
     - pysam
     - raxml >=8.2.10
     - samtools >=1.9
-    - r-base
+    - r-base >=4
     - r-essentials
     - r-optparse
     - r-rbiom
     - r-ape
     - r-compositions
     - r-biocmanager
     - bioconductor-microbiome
-
+    - hclust2
 test:
   commands:
     - metaphlan -v
-    - strainphlan -h
+    - strainphlan -v
     - extract_markers.py -h
     - merge_metaphlan_tables.py -h
     - read_fastx.py -h
@@ -61,6 +61,10 @@ test:
     - sample2markers.py -h
     - strain_transmission.py -h
     - sgb_to_gtdb_profile.py -h
+    - metaphlan2krona.py -h    
+    - hclust2.py -h
+    - run_treeshrink.py -h
+    - treeshrink.py -h
 
 about:
   home: https://github.com/biobakery/metaphlan

diff --git a/metaphlan/metaphlan.py b/metaphlan/metaphlan.py
@@ -4,8 +4,8 @@
               'Nicola Segata ([email protected]), '
               'Duy Tin Truong, '
               'Francesco Asnicar ([email protected])')
-__version__ = '4.0.1'
-__date__ = '24 Aug 2022'
+__version__ = '4.0.2'
+__date__ = '22 Sep 2022'
 
 import sys
 try:
@@ -21,6 +21,7 @@
 import stat
 import re
 import time
+import random
 from collections import defaultdict as defdict
 from distutils.version import LooseVersion
 from glob import glob
@@ -335,6 +336,10 @@ def read_params(args):
     arg = g.add_argument
     arg('--nproc', metavar="N", type=int, default=4,
         help="The number of CPUs to use for parallelizing the mapping [default 4]")
+    arg('--subsampling', type=int, default=None,
+        help="Specify the number of reads to be considered from the input metagenomes [default None]")
+    arg('--subsampling_seed', type=str, default='1992',
+        help="Random seed to use in the selection of the subsampled reads. Choose \"random\r for a random behaviour")
     arg('--install', action='store_true',
         help="Only checks if the MetaPhlAn DB is installed and installs it if not. All other parameters are ignored.")
     arg('--offline', action='store_true',
@@ -819,7 +824,14 @@ def mapq_filter(marker_name, mapq_value, min_mapq_val):
             return True
     return False
 
-def map2bbh(mapping_f, min_mapq_val, input_type='bowtie2out', min_alignment_len=None):
+
+def separate_reads2markers(reads2markers):
+    if not SGB_ANALYSIS:
+        return reads2markers, {}
+    else:
+        return {r: m for r, m in reads2markers.items() if ('SGB' in m or 'EUK' in m) and not 'VDB' in m}, {r: m for r, m in reads2markers.items() if 'VDB' in m and not ('SGB' in m or 'EUK' in m)}
+
+def map2bbh(mapping_f, min_mapq_val, input_type='bowtie2out', min_alignment_len=None, subsampling=None, subsampling_seed='1992'):
     if not mapping_f:
         ras, ras_line, inpf = plain_read_and_split, plain_read_and_split_line, sys.stdin
     else:
@@ -851,7 +863,31 @@ def map2bbh(mapping_f, min_mapq_val, input_type='bowtie2out', min_alignment_len=
             ):
                     reads2markers[o[0]] = o[2].split('/')[0]
     inpf.close()
-    markers2reads = defdict(set)
+
+    if subsampling != None:
+        if subsampling >= n_metagenome_reads:
+            sys.stderr.write("WARNING: The specified subsampling ({}) is higher than the original number of reads ({}).".format(subsampling, n_metagenome_reads))
+        elif subsampling < 10000:
+            sys.stderr.write("WARNING: The specified subsampling ({}) is below the recommended minimum of 10,000 reads.".format(subsampling))
+        else:
+            reads2markers =  dict(sorted(reads2markers.items()))
+            if subsampling_seed.lower() != 'random':
+                random.seed(int(subsampling_seed))
+            reads2filtmarkers = {}
+            sgb_reads2markers, viral_reads2markers = separate_reads2markers(reads2markers)            
+            n_sgb_mapped_reads = int((len(sgb_reads2markers) * subsampling) / n_metagenome_reads)
+            reads2filtmarkers = { r:sgb_reads2markers[r] for r in random.sample(list(sgb_reads2markers.keys()), n_sgb_mapped_reads) }     
+            if SGB_ANALYSIS:       
+                n_viral_mapped_reads = int((len(viral_reads2markers) * subsampling) / n_metagenome_reads)
+                reads2filtmarkers.update({ r:viral_reads2markers[r] for r in random.sample(list(viral_reads2markers.keys()), n_viral_mapped_reads) })            
+            reads2markers = reads2filtmarkers
+            sgb_reads2markers.clear()
+            viral_reads2markers.clear()
+            n_metagenome_reads = subsampling
+    elif n_metagenome_reads < 10000:
+        sys.stderr.write("WARNING: The number of reads in the sample ({}) is below the recommended minimum of 10,000 reads.".format(subsampling))
+
+    markers2reads = defdict(set)   
     for r, m in reads2markers.items():
         markers2reads[m].add(r)
 
@@ -952,6 +988,11 @@ def main():
     SGB_ANALYSIS = not pars['mpa3']
 
     ESTIMATE_UNK = pars['unclassified_estimation']
+
+    if not (pars['subsampling_seed'].lower() == 'random' or pars['subsampling_seed'].isdigit()):
+        sys.stderr.write("Error: The --subsampling_seed parameter is not accepted. It should contain an integer number or \"random\". Exiting...\n\n")
+        sys.exit(1)
+
 
     # check if the database is installed, if not then install
     pars['index'] = check_and_install_database(pars['index'], pars['bowtie2db'], pars['bowtie2_build'], pars['nproc'], pars['force_download'], pars['offline'])
@@ -1045,7 +1086,7 @@ def main():
     tree = TaxTree( mpa_pkl, ignore_markers )
     tree.set_min_cu_len( pars['min_cu_len'] )
 
-    markers2reads, n_metagenome_reads, avg_read_length = map2bbh(pars['inp'], pars['min_mapq_val'], pars['input_type'], pars['min_alignment_len'])
+    markers2reads, n_metagenome_reads, avg_read_length = map2bbh(pars['inp'], pars['min_mapq_val'], pars['input_type'], pars['min_alignment_len'], pars['subsampling'], pars['subsampling_seed'])
 
     tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'])
 
@@ -1089,6 +1130,7 @@ def main():
         if not MPA2_OUTPUT:
             outf.write('#{}\n'.format(pars['index']))
             outf.write('#{}\n'.format(' '.join(sys.argv)))
+            outf.write('#{} reads processed\n'.format(n_metagenome_reads))
 
         if not CAMI_OUTPUT:
             outf.write('#' + '\t'.join((pars["sample_id_key"], pars["sample_id"])) + '\n')