diff --git a/metaphlan/metaphlan.py b/metaphlan/metaphlan.py index 92cf1f4..1b83c03 100755 --- a/metaphlan/metaphlan.py +++ b/metaphlan/metaphlan.py @@ -4,8 +4,8 @@ 'Duy Tin Truong, ' 'Francesco Asnicar (f.asnicar@unitn.it), ' 'Aitor Blanco Miguez (aitor.blancomiguez@unitn.it)') -__version__ = '3.0.11' -__date__ = '07 Jul 2021' +__version__ = '3.0.12' +__date__ = '14 Jul 2021' import sys try: @@ -66,6 +66,9 @@ INDEX = 'latest' tax_units = "kpcofgst" +# set default parameters +GENOME_LENGTH_BOOST_FOR_UNKNOWN_ESTIMATE=1.05 + def read_params(args): p = ap.ArgumentParser( description= "DESCRIPTION\n" @@ -585,6 +588,10 @@ def compute_abundance( self ): if rat < 0.0: pass + elif self.stat == 'unknown': + wnreads = sorted([(float(n)/(np.absolute(r-self.avg_read_length)+1),(np.absolute(r - self.avg_read_length)+1) ,n) for r,n in rat_nreads], key=lambda x:x[0]) + den,num = zip(*[v[1:] for v in wnreads]) + loc_ab = float(sum(num))/float(sum(den)) if any(den) else 0.0 elif self.stat == 'avg_g' or (not qn and self.stat in ['wavg_g','tavg_g']): loc_ab = nrawreads / rat if rat >= 0 else 0.0 elif self.stat == 'avg_l' or (not qn and self.stat in ['wavg_l','tavg_l']): @@ -633,8 +640,9 @@ def get_all_abundances( self ): return ret class TaxTree: - def __init__( self, mpa, markers_to_ignore = None ): #, min_cu_len ): + def __init__( self, mpa, markers_to_ignore = None, unknown_calculation = None ): #, min_cu_len ): self.root = TaxClade( "root", 0) + self.unknown_calculation = unknown_calculation self.all_clades, self.markers2lens, self.markers2clades, self.taxa2clades, self.markers2exts = {}, {}, {}, {}, {} TaxClade.markers2lens = self.markers2lens TaxClade.markers2exts = self.markers2exts @@ -669,7 +677,11 @@ def add_lens( node ): lens = [] for c in node.children.values(): lens.append( add_lens( c ) ) - node.glen = min(np.mean(lens), np.median(lens)) + # use a different length for the unknown calculation + if self.unknown_calculation: + node.glen = np.median(lens) * GENOME_LENGTH_BOOST_FOR_UNKNOWN_ESTIMATE + else: + node.glen = min(np.mean(lens), np.median(lens)) return node.glen add_lens(self.root) @@ -919,6 +931,50 @@ def to_biomformat(clade_name): return True +def create_tree_and_add_reads(mpa_pkl, ignore_markers, pars, avg_read_length, markers2reads, unknown_calculation=False): + # Create an instance of the TaxTree and update the stats settings and then add the reads and markers + tree = TaxTree( mpa_pkl, ignore_markers , unknown_calculation = unknown_calculation) + tree.set_min_cu_len( pars['min_cu_len'] ) + tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'] ) + + map_out = [] + for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]): + if marker not in tree.markers2lens: + continue + tax_seq, ids_seq = tree.add_reads( marker, len(reads), + add_viruses = pars['add_viruses'], + ignore_eukaryotes = pars['ignore_eukaryotes'], + ignore_bacteria = pars['ignore_bacteria'], + ignore_archaea = pars['ignore_archaea'], + ) + if tax_seq: + map_out +=["\t".join([r,tax_seq, ids_seq]) for r in sorted(reads)] + + return tree, map_out + +def compute_relative_abundance_and_fraction_of_mapped_reads(tree, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK, unknown_calculation=False): + # Using the tree compute the relative abundance and then the fraction of mapped reads using the total mapped reads + + # if computing the unknown calculation, set the tree settings to reset to the unknown stat + # must be set here and then reset as this is a global setting that will apply to all trees if set + if unknown_calculation: + tree.set_stat( 'unknown', pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'] ) + + cl2ab, _, tot_nreads = tree.relative_abundances( + pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None ) + + if unknown_calculation: + tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'] ) + + # If the mapped reads are over-estimated, set the ratio at 1 + fraction_mapped_reads = min(tot_nreads/float(n_metagenome_reads), 1.0) if ESTIMATE_UNK else 1.0 + + # check for a negative fraction + if unknown_calculation and fraction_mapped_reads < 0: + fraction_mapped_reads = 1.0 + + return cl2ab, fraction_mapped_reads, tot_nreads + def main(): ranks2code = { 'k' : 'superkingdom', 'p' : 'phylum', 'c':'class', 'o' : 'order', 'f' : 'family', 'g' : 'genus', 's' : 'species'} @@ -1008,12 +1064,12 @@ def main(): mpa_pkl = pickle.load( a ) REPORT_MERGED = mpa_pkl.get('merged_taxon',False) - tree = TaxTree( mpa_pkl, ignore_markers ) - tree.set_min_cu_len( pars['min_cu_len'] ) markers2reads, n_metagenome_reads, avg_read_length = map2bbh(pars['inp'], pars['min_mapq_val'], pars['input_type'], pars['min_alignment_len']) - tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm']) + # create a tree for the relative abundance and unknown calculations (each use different default computations as unknown includes all the alignments) + tree, map_out = create_tree_and_add_reads( mpa_pkl, ignore_markers, pars, avg_read_length, markers2reads) + tree_unknown, map_out_unknown = create_tree_and_add_reads( mpa_pkl, ignore_markers, pars, avg_read_length, markers2reads, unknown_calculation=True) if no_map: os.remove( pars['inp'] ) @@ -1029,19 +1085,6 @@ def main(): "\nExiting...\n\n" ) sys.exit(1) - map_out = [] - for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]): - if marker not in tree.markers2lens: - continue - tax_seq, ids_seq = tree.add_reads( marker, len(reads), - add_viruses = pars['add_viruses'], - ignore_eukaryotes = pars['ignore_eukaryotes'], - ignore_bacteria = pars['ignore_bacteria'], - ignore_archaea = pars['ignore_archaea'], - ) - if tax_seq: - map_out +=["\t".join([r,tax_seq, ids_seq]) for r in sorted(reads)] - if pars['output'] is None and pars['output_file'] is not None: pars['output'] = pars['output_file'] @@ -1057,17 +1100,6 @@ def main(): if not CAMI_OUTPUT: outf.write('#' + '\t'.join((pars["sample_id_key"], pars["sample_id"])) + '\n') - if ESTIMATE_UNK: - mapped_reads = 0 - cl2pr = tree.clade_profiles( pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None ) - for c, m in cl2pr.items(): - markers_cov = [a / 1000 for _, a in m] - mapped_reads += np.mean(markers_cov) * tree.all_clades[c.split('|')[-1]].glen - # If the mapped reads are over-estimated, set the ratio at 1 - fraction_mapped_reads = min(mapped_reads/float(n_metagenome_reads), 1.0) - else: - fraction_mapped_reads = 1.0 - if pars['t'] == 'reads_map': if not MPA2_OUTPUT: outf.write('#read_id\tNCBI_taxlineage_str\tNCBI_taxlineage_ids\n') @@ -1082,9 +1114,11 @@ def main(): else: outf.write('#clade_name\tNCBI_tax_id\trelative_abundance\n') - cl2ab, _, tot_nreads = tree.relative_abundances( - pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None ) - + cl2ab, fraction_mapped_reads, tot_nreads = compute_relative_abundance_and_fraction_of_mapped_reads(tree, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK) + + # compute the mapped reads fraction again considering all alignments + cl2ab_unknown, fraction_mapped_reads, tot_nreads = compute_relative_abundance_and_fraction_of_mapped_reads(tree_unknown, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK, unknown_calculation=True) + outpred = [(taxstr, taxid,round(relab*100.0,5)) for (taxstr, taxid), relab in cl2ab.items() if relab > 0.0] has_repr = False @@ -1134,8 +1168,9 @@ def main(): maybe_generate_biom_file(tree, pars, outpred) elif pars['t'] == 'rel_ab_w_read_stats': - cl2ab, rr, tot_nreads = tree.relative_abundances( - pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None ) + cl2ab, fraction_mapped_reads, tot_nreads = compute_relative_abundance_and_fraction_of_mapped_reads(tree, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK) + # compute the mapped reads fraction again considering all alignments + cl2ab_unknown, fraction_mapped_reads, tot_nreads = compute_relative_abundance_and_fraction_of_mapped_reads(tree_unknown, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK, unknown_calculation=True) unmapped_reads = max(n_metagenome_reads - tot_nreads, 0) diff --git a/metaphlan/strainphlan.py b/metaphlan/strainphlan.py index 879af43..9780143 100755 --- a/metaphlan/strainphlan.py +++ b/metaphlan/strainphlan.py @@ -4,8 +4,8 @@ 'Francesco Asnicar (f.asnicar@unitn.it), ' 'Moreno Zolfo (moreno.zolfo@unitn.it), ' 'Francesco Beghini (francesco.beghini@unitn.it)') -__version__ = '3.0.11' -__date__ = '07 Jul 2021' +__version__ = '3.0.12' +__date__ = '14 Jul 2021' import sys diff --git a/setup.py b/setup.py index c3addc4..4771a83 100755 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ setuptools.setup( name='MetaPhlAn', - version='3.0.11', + version='3.0.12', author='Francesco Beghini', author_email='francesco.beghini@unitn.it', url='http://github.com/biobakery/MetaPhlAn/',