-
Notifications
You must be signed in to change notification settings - Fork 86
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #155 from ljmciver/master
v3.0.12 alpha, unknown estimate changes
- Loading branch information
Showing
3 changed files
with
74 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,8 +4,8 @@ | |
'Duy Tin Truong, ' | ||
'Francesco Asnicar ([email protected]), ' | ||
'Aitor Blanco Miguez ([email protected])') | ||
__version__ = '3.0.11' | ||
__date__ = '07 Jul 2021' | ||
__version__ = '3.0.12' | ||
__date__ = '14 Jul 2021' | ||
|
||
import sys | ||
try: | ||
|
@@ -66,6 +66,9 @@ | |
INDEX = 'latest' | ||
tax_units = "kpcofgst" | ||
|
||
# set default parameters | ||
GENOME_LENGTH_BOOST_FOR_UNKNOWN_ESTIMATE=1.05 | ||
|
||
def read_params(args): | ||
p = ap.ArgumentParser( description= | ||
"DESCRIPTION\n" | ||
|
@@ -585,6 +588,10 @@ def compute_abundance( self ): | |
|
||
if rat < 0.0: | ||
pass | ||
elif self.stat == 'unknown': | ||
wnreads = sorted([(float(n)/(np.absolute(r-self.avg_read_length)+1),(np.absolute(r - self.avg_read_length)+1) ,n) for r,n in rat_nreads], key=lambda x:x[0]) | ||
den,num = zip(*[v[1:] for v in wnreads]) | ||
loc_ab = float(sum(num))/float(sum(den)) if any(den) else 0.0 | ||
elif self.stat == 'avg_g' or (not qn and self.stat in ['wavg_g','tavg_g']): | ||
loc_ab = nrawreads / rat if rat >= 0 else 0.0 | ||
elif self.stat == 'avg_l' or (not qn and self.stat in ['wavg_l','tavg_l']): | ||
|
@@ -633,8 +640,9 @@ def get_all_abundances( self ): | |
return ret | ||
|
||
class TaxTree: | ||
def __init__( self, mpa, markers_to_ignore = None ): #, min_cu_len ): | ||
def __init__( self, mpa, markers_to_ignore = None, unknown_calculation = None ): #, min_cu_len ): | ||
self.root = TaxClade( "root", 0) | ||
self.unknown_calculation = unknown_calculation | ||
self.all_clades, self.markers2lens, self.markers2clades, self.taxa2clades, self.markers2exts = {}, {}, {}, {}, {} | ||
TaxClade.markers2lens = self.markers2lens | ||
TaxClade.markers2exts = self.markers2exts | ||
|
@@ -669,7 +677,11 @@ def add_lens( node ): | |
lens = [] | ||
for c in node.children.values(): | ||
lens.append( add_lens( c ) ) | ||
node.glen = min(np.mean(lens), np.median(lens)) | ||
# use a different length for the unknown calculation | ||
if self.unknown_calculation: | ||
node.glen = np.median(lens) * GENOME_LENGTH_BOOST_FOR_UNKNOWN_ESTIMATE | ||
else: | ||
node.glen = min(np.mean(lens), np.median(lens)) | ||
return node.glen | ||
|
||
add_lens(self.root) | ||
|
@@ -919,6 +931,50 @@ def to_biomformat(clade_name): | |
|
||
return True | ||
|
||
def create_tree_and_add_reads(mpa_pkl, ignore_markers, pars, avg_read_length, markers2reads, unknown_calculation=False): | ||
# Create an instance of the TaxTree and update the stats settings and then add the reads and markers | ||
tree = TaxTree( mpa_pkl, ignore_markers , unknown_calculation = unknown_calculation) | ||
tree.set_min_cu_len( pars['min_cu_len'] ) | ||
tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'] ) | ||
|
||
map_out = [] | ||
for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]): | ||
if marker not in tree.markers2lens: | ||
continue | ||
tax_seq, ids_seq = tree.add_reads( marker, len(reads), | ||
add_viruses = pars['add_viruses'], | ||
ignore_eukaryotes = pars['ignore_eukaryotes'], | ||
ignore_bacteria = pars['ignore_bacteria'], | ||
ignore_archaea = pars['ignore_archaea'], | ||
) | ||
if tax_seq: | ||
map_out +=["\t".join([r,tax_seq, ids_seq]) for r in sorted(reads)] | ||
|
||
return tree, map_out | ||
|
||
def compute_relative_abundance_and_fraction_of_mapped_reads(tree, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK, unknown_calculation=False): | ||
# Using the tree compute the relative abundance and then the fraction of mapped reads using the total mapped reads | ||
|
||
# if computing the unknown calculation, set the tree settings to reset to the unknown stat | ||
# must be set here and then reset as this is a global setting that will apply to all trees if set | ||
if unknown_calculation: | ||
tree.set_stat( 'unknown', pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'] ) | ||
|
||
cl2ab, _, tot_nreads = tree.relative_abundances( | ||
pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None ) | ||
|
||
if unknown_calculation: | ||
tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'] ) | ||
|
||
# If the mapped reads are over-estimated, set the ratio at 1 | ||
fraction_mapped_reads = min(tot_nreads/float(n_metagenome_reads), 1.0) if ESTIMATE_UNK else 1.0 | ||
|
||
# check for a negative fraction | ||
if unknown_calculation and fraction_mapped_reads < 0: | ||
fraction_mapped_reads = 1.0 | ||
|
||
return cl2ab, fraction_mapped_reads, tot_nreads | ||
|
||
def main(): | ||
ranks2code = { 'k' : 'superkingdom', 'p' : 'phylum', 'c':'class', | ||
'o' : 'order', 'f' : 'family', 'g' : 'genus', 's' : 'species'} | ||
|
@@ -1008,12 +1064,12 @@ def main(): | |
mpa_pkl = pickle.load( a ) | ||
|
||
REPORT_MERGED = mpa_pkl.get('merged_taxon',False) | ||
tree = TaxTree( mpa_pkl, ignore_markers ) | ||
tree.set_min_cu_len( pars['min_cu_len'] ) | ||
|
||
markers2reads, n_metagenome_reads, avg_read_length = map2bbh(pars['inp'], pars['min_mapq_val'], pars['input_type'], pars['min_alignment_len']) | ||
|
||
tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm']) | ||
# create a tree for the relative abundance and unknown calculations (each use different default computations as unknown includes all the alignments) | ||
tree, map_out = create_tree_and_add_reads( mpa_pkl, ignore_markers, pars, avg_read_length, markers2reads) | ||
tree_unknown, map_out_unknown = create_tree_and_add_reads( mpa_pkl, ignore_markers, pars, avg_read_length, markers2reads, unknown_calculation=True) | ||
|
||
if no_map: | ||
os.remove( pars['inp'] ) | ||
|
@@ -1029,19 +1085,6 @@ def main(): | |
"\nExiting...\n\n" ) | ||
sys.exit(1) | ||
|
||
map_out = [] | ||
for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]): | ||
if marker not in tree.markers2lens: | ||
continue | ||
tax_seq, ids_seq = tree.add_reads( marker, len(reads), | ||
add_viruses = pars['add_viruses'], | ||
ignore_eukaryotes = pars['ignore_eukaryotes'], | ||
ignore_bacteria = pars['ignore_bacteria'], | ||
ignore_archaea = pars['ignore_archaea'], | ||
) | ||
if tax_seq: | ||
map_out +=["\t".join([r,tax_seq, ids_seq]) for r in sorted(reads)] | ||
|
||
if pars['output'] is None and pars['output_file'] is not None: | ||
pars['output'] = pars['output_file'] | ||
|
||
|
@@ -1057,17 +1100,6 @@ def main(): | |
if not CAMI_OUTPUT: | ||
outf.write('#' + '\t'.join((pars["sample_id_key"], pars["sample_id"])) + '\n') | ||
|
||
if ESTIMATE_UNK: | ||
mapped_reads = 0 | ||
cl2pr = tree.clade_profiles( pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None ) | ||
for c, m in cl2pr.items(): | ||
markers_cov = [a / 1000 for _, a in m] | ||
mapped_reads += np.mean(markers_cov) * tree.all_clades[c.split('|')[-1]].glen | ||
# If the mapped reads are over-estimated, set the ratio at 1 | ||
fraction_mapped_reads = min(mapped_reads/float(n_metagenome_reads), 1.0) | ||
else: | ||
fraction_mapped_reads = 1.0 | ||
|
||
if pars['t'] == 'reads_map': | ||
if not MPA2_OUTPUT: | ||
outf.write('#read_id\tNCBI_taxlineage_str\tNCBI_taxlineage_ids\n') | ||
|
@@ -1082,9 +1114,11 @@ def main(): | |
else: | ||
outf.write('#clade_name\tNCBI_tax_id\trelative_abundance\n') | ||
|
||
cl2ab, _, tot_nreads = tree.relative_abundances( | ||
pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None ) | ||
|
||
cl2ab, fraction_mapped_reads, tot_nreads = compute_relative_abundance_and_fraction_of_mapped_reads(tree, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK) | ||
|
||
# compute the mapped reads fraction again considering all alignments | ||
cl2ab_unknown, fraction_mapped_reads, tot_nreads = compute_relative_abundance_and_fraction_of_mapped_reads(tree_unknown, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK, unknown_calculation=True) | ||
|
||
outpred = [(taxstr, taxid,round(relab*100.0,5)) for (taxstr, taxid), relab in cl2ab.items() if relab > 0.0] | ||
has_repr = False | ||
|
||
|
@@ -1134,8 +1168,9 @@ def main(): | |
maybe_generate_biom_file(tree, pars, outpred) | ||
|
||
elif pars['t'] == 'rel_ab_w_read_stats': | ||
cl2ab, rr, tot_nreads = tree.relative_abundances( | ||
pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None ) | ||
cl2ab, fraction_mapped_reads, tot_nreads = compute_relative_abundance_and_fraction_of_mapped_reads(tree, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK) | ||
# compute the mapped reads fraction again considering all alignments | ||
cl2ab_unknown, fraction_mapped_reads, tot_nreads = compute_relative_abundance_and_fraction_of_mapped_reads(tree_unknown, pars, n_metagenome_reads, avg_read_length, ESTIMATE_UNK, unknown_calculation=True) | ||
|
||
unmapped_reads = max(n_metagenome_reads - tot_nreads, 0) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,8 +4,8 @@ | |
'Francesco Asnicar ([email protected]), ' | ||
'Moreno Zolfo ([email protected]), ' | ||
'Francesco Beghini ([email protected])') | ||
__version__ = '3.0.11' | ||
__date__ = '07 Jul 2021' | ||
__version__ = '3.0.12' | ||
__date__ = '14 Jul 2021' | ||
|
||
|
||
import sys | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,7 +13,7 @@ | |
|
||
setuptools.setup( | ||
name='MetaPhlAn', | ||
version='3.0.11', | ||
version='3.0.12', | ||
author='Francesco Beghini', | ||
author_email='[email protected]', | ||
url='http://github.com/biobakery/MetaPhlAn/', | ||
|