Skip to content

Commit

Permalink
Merge pull request #155 from ljmciver/master
Browse files Browse the repository at this point in the history
v3.0.12 alpha, unknown estimate changes
  • Loading branch information
abmiguez authored Jul 21, 2021
2 parents 0b8cc95 + e70dc94 commit 5b29465
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 39 deletions.
107 changes: 71 additions & 36 deletions metaphlan/metaphlan.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
'Duy Tin Truong, '
'Francesco Asnicar ([email protected]), '
'Aitor Blanco Miguez ([email protected])')
__version__ = '3.0.11'
__date__ = '07 Jul 2021'
__version__ = '3.0.12'
__date__ = '14 Jul 2021'

import sys
try:
Expand Down Expand Up @@ -66,6 +66,9 @@
INDEX = 'latest'
tax_units = "kpcofgst"

# set default parameters
GENOME_LENGTH_BOOST_FOR_UNKNOWN_ESTIMATE=1.05

def read_params(args):
p = ap.ArgumentParser( description=
"DESCRIPTION\n"
Expand Down Expand Up @@ -585,6 +588,10 @@ def compute_abundance( self ):

if rat < 0.0:
pass
elif self.stat == 'unknown':
wnreads = sorted([(float(n)/(np.absolute(r-self.avg_read_length)+1),(np.absolute(r - self.avg_read_length)+1) ,n) for r,n in rat_nreads], key=lambda x:x[0])
den,num = zip(*[v[1:] for v in wnreads])
loc_ab = float(sum(num))/float(sum(den)) if any(den) else 0.0
elif self.stat == 'avg_g' or (not qn and self.stat in ['wavg_g','tavg_g']):
loc_ab = nrawreads / rat if rat >= 0 else 0.0
elif self.stat == 'avg_l' or (not qn and self.stat in ['wavg_l','tavg_l']):
Expand Down Expand Up @@ -633,8 +640,9 @@ def get_all_abundances( self ):
return ret

class TaxTree:
def __init__( self, mpa, markers_to_ignore = None ): #, min_cu_len ):
def __init__( self, mpa, markers_to_ignore = None, unknown_calculation = None ): #, min_cu_len ):
self.root = TaxClade( "root", 0)
self.unknown_calculation = unknown_calculation
self.all_clades, self.markers2lens, self.markers2clades, self.taxa2clades, self.markers2exts = {}, {}, {}, {}, {}
TaxClade.markers2lens = self.markers2lens
TaxClade.markers2exts = self.markers2exts
Expand Down Expand Up @@ -669,7 +677,11 @@ def add_lens( node ):
lens = []
for c in node.children.values():
lens.append( add_lens( c ) )
node.glen = min(np.mean(lens), np.median(lens))
# use a different length for the unknown calculation
if self.unknown_calculation:
node.glen = np.median(lens) * GENOME_LENGTH_BOOST_FOR_UNKNOWN_ESTIMATE
else:
node.glen = min(np.mean(lens), np.median(lens))
return node.glen

add_lens(self.root)
Expand Down Expand Up @@ -919,6 +931,50 @@ def to_biomformat(clade_name):

return True

def create_tree_and_add_reads(mpa_pkl, ignore_markers, pars, avg_read_length, markers2reads, unknown_calculation=False):
# Create an instance of the TaxTree and update the stats settings and then add the reads and markers
tree = TaxTree( mpa_pkl, ignore_markers , unknown_calculation = unknown_calculation)
tree.set_min_cu_len( pars['min_cu_len'] )
tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'] )

map_out = []
for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]):
if marker not in tree.markers2lens:
continue
tax_seq, ids_seq = tree.add_reads( marker, len(reads),
add_viruses = pars['add_viruses'],
ignore_eukaryotes = pars['ignore_eukaryotes'],
ignore_bacteria = pars['ignore_bacteria'],
ignore_archaea = pars['ignore_archaea'],
)
if tax_seq:
map_out +=["\t".join([r,tax_seq, ids_seq]) for r in sorted(reads)]

return tree, map_out

def compute_relative_abundance_and_fraction_of_mapped_reads(tree, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK, unknown_calculation=False):
# Using the tree compute the relative abundance and then the fraction of mapped reads using the total mapped reads

# if computing the unknown calculation, set the tree settings to reset to the unknown stat
# must be set here and then reset as this is a global setting that will apply to all trees if set
if unknown_calculation:
tree.set_stat( 'unknown', pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'] )

cl2ab, _, tot_nreads = tree.relative_abundances(
pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None )

if unknown_calculation:
tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'] )

# If the mapped reads are over-estimated, set the ratio at 1
fraction_mapped_reads = min(tot_nreads/float(n_metagenome_reads), 1.0) if ESTIMATE_UNK else 1.0

# check for a negative fraction
if unknown_calculation and fraction_mapped_reads < 0:
fraction_mapped_reads = 1.0

return cl2ab, fraction_mapped_reads, tot_nreads

def main():
ranks2code = { 'k' : 'superkingdom', 'p' : 'phylum', 'c':'class',
'o' : 'order', 'f' : 'family', 'g' : 'genus', 's' : 'species'}
Expand Down Expand Up @@ -1008,12 +1064,12 @@ def main():
mpa_pkl = pickle.load( a )

REPORT_MERGED = mpa_pkl.get('merged_taxon',False)
tree = TaxTree( mpa_pkl, ignore_markers )
tree.set_min_cu_len( pars['min_cu_len'] )

markers2reads, n_metagenome_reads, avg_read_length = map2bbh(pars['inp'], pars['min_mapq_val'], pars['input_type'], pars['min_alignment_len'])

tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'])
# create a tree for the relative abundance and unknown calculations (each use different default computations as unknown includes all the alignments)
tree, map_out = create_tree_and_add_reads( mpa_pkl, ignore_markers, pars, avg_read_length, markers2reads)
tree_unknown, map_out_unknown = create_tree_and_add_reads( mpa_pkl, ignore_markers, pars, avg_read_length, markers2reads, unknown_calculation=True)

if no_map:
os.remove( pars['inp'] )
Expand All @@ -1029,19 +1085,6 @@ def main():
"\nExiting...\n\n" )
sys.exit(1)

map_out = []
for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]):
if marker not in tree.markers2lens:
continue
tax_seq, ids_seq = tree.add_reads( marker, len(reads),
add_viruses = pars['add_viruses'],
ignore_eukaryotes = pars['ignore_eukaryotes'],
ignore_bacteria = pars['ignore_bacteria'],
ignore_archaea = pars['ignore_archaea'],
)
if tax_seq:
map_out +=["\t".join([r,tax_seq, ids_seq]) for r in sorted(reads)]

if pars['output'] is None and pars['output_file'] is not None:
pars['output'] = pars['output_file']

Expand All @@ -1057,17 +1100,6 @@ def main():
if not CAMI_OUTPUT:
outf.write('#' + '\t'.join((pars["sample_id_key"], pars["sample_id"])) + '\n')

if ESTIMATE_UNK:
mapped_reads = 0
cl2pr = tree.clade_profiles( pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None )
for c, m in cl2pr.items():
markers_cov = [a / 1000 for _, a in m]
mapped_reads += np.mean(markers_cov) * tree.all_clades[c.split('|')[-1]].glen
# If the mapped reads are over-estimated, set the ratio at 1
fraction_mapped_reads = min(mapped_reads/float(n_metagenome_reads), 1.0)
else:
fraction_mapped_reads = 1.0

if pars['t'] == 'reads_map':
if not MPA2_OUTPUT:
outf.write('#read_id\tNCBI_taxlineage_str\tNCBI_taxlineage_ids\n')
Expand All @@ -1082,9 +1114,11 @@ def main():
else:
outf.write('#clade_name\tNCBI_tax_id\trelative_abundance\n')

cl2ab, _, tot_nreads = tree.relative_abundances(
pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None )

cl2ab, fraction_mapped_reads, tot_nreads = compute_relative_abundance_and_fraction_of_mapped_reads(tree, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK)

# compute the mapped reads fraction again considering all alignments
cl2ab_unknown, fraction_mapped_reads, tot_nreads = compute_relative_abundance_and_fraction_of_mapped_reads(tree_unknown, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK, unknown_calculation=True)

outpred = [(taxstr, taxid,round(relab*100.0,5)) for (taxstr, taxid), relab in cl2ab.items() if relab > 0.0]
has_repr = False

Expand Down Expand Up @@ -1134,8 +1168,9 @@ def main():
maybe_generate_biom_file(tree, pars, outpred)

elif pars['t'] == 'rel_ab_w_read_stats':
cl2ab, rr, tot_nreads = tree.relative_abundances(
pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None )
cl2ab, fraction_mapped_reads, tot_nreads = compute_relative_abundance_and_fraction_of_mapped_reads(tree, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK)
# compute the mapped reads fraction again considering all alignments
cl2ab_unknown, fraction_mapped_reads, tot_nreads = compute_relative_abundance_and_fraction_of_mapped_reads(tree_unknown, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK, unknown_calculation=True)

unmapped_reads = max(n_metagenome_reads - tot_nreads, 0)

Expand Down
4 changes: 2 additions & 2 deletions metaphlan/strainphlan.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
'Francesco Asnicar ([email protected]), '
'Moreno Zolfo ([email protected]), '
'Francesco Beghini ([email protected])')
__version__ = '3.0.11'
__date__ = '07 Jul 2021'
__version__ = '3.0.12'
__date__ = '14 Jul 2021'


import sys
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

setuptools.setup(
name='MetaPhlAn',
version='3.0.11',
version='3.0.12',
author='Francesco Beghini',
author_email='[email protected]',
url='http://github.com/biobakery/MetaPhlAn/',
Expand Down

0 comments on commit 5b29465

Please sign in to comment.