Skip to content

Commit

Permalink
Fix normalization
Browse files — browse the repository at this point in the history
  • Loading branch information
Francesco Beghini committed Nov 10, 2020
1 parent 18e3337 commit 75bb198
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 73 deletions.
42 changes: 28 additions & 14 deletions metaphlan/metaphlan.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
'Nicola Segata ([email protected]), '
'Duy Tin Truong, '
'Francesco Asnicar ([email protected])')
__version__ = '3.0.4'
__date__ = '23 Sep 2020'
__version__ = '3.0.5'
__date__ = '10 Nov 2020'

import sys
try:
Expand Down Expand Up @@ -422,12 +422,18 @@ def run_bowtie2(fna_in, outfmt6_out, bowtie2_db, preset, nproc, min_mapq_val, fi
p.communicate()
read_fastx_stderr = readin.stderr.readlines()
nreads = None
avg_read_length = None
try:
nreads = int(read_fastx_stderr[0])
nreads, avg_read_length = list(map(float, read_fastx_stderr[0].decode().split()))
if not nreads:
sys.stderr.write('Fatal error running MetaPhlAn. Total metagenome size was not estimated.\nPlease check your input files.\n')
sys.exit(1)
outf.write(lmybytes('#nreads\t{}'.format(nreads)))
if not avg_read_length:
sys.stderr.write('Fatal error running MetaPhlAn. The average read length was not estimated.\nPlease check your input files.\n')
sys.exit(1)

outf.write(lmybytes('#nreads\t{}\n'.format(int(nreads))))
outf.write(lmybytes('#avg_read_length\t{}'.format(avg_read_length)))
outf.close()
except ValueError:
sys.stderr.write(b''.join(read_fastx_stderr).decode())
Expand Down Expand Up @@ -457,6 +463,7 @@ class TaxClade:
perc_nonzero = None
quantile = None
avoid_disqm = False
avg_read_length = 1

def __init__( self, name, tax_id, uncl = False):
self.children, self.markers2nreads = {}, {}
Expand Down Expand Up @@ -500,7 +507,7 @@ def get_full_name( self ):
return "|".join(fullname[1:])

def get_normalized_counts( self ):
return [(m,float(n)*1000.0/self.markers2lens[m])
return [(m,float(n)*1000.0/(self.markers2lens[m] - self.avg_read_length +1) )
for m,n in self.markers2nreads.items()]

def compute_mapped_reads( self ):
Expand Down Expand Up @@ -577,24 +584,24 @@ def compute_abundance( self ):
elif self.stat == 'avg_g' or (not qn and self.stat in ['wavg_g','tavg_g']):
loc_ab = nrawreads / rat if rat >= 0 else 0.0
elif self.stat == 'avg_l' or (not qn and self.stat in ['wavg_l','tavg_l']):
loc_ab = np.mean([float(n)/r for r,n in rat_nreads])
loc_ab = np.mean([float(n)/(r - self.avg_read_length + 1) for r,n in rat_nreads])
elif self.stat == 'tavg_g':
wnreads = sorted([(float(n)/r,r,n) for r,n in rat_nreads], key=lambda x:x[0])
wnreads = sorted([(float(n)/(r-self.avg_read_length+1),(r - self.avg_read_length+1) ,n) for r,n in rat_nreads], key=lambda x:x[0])
den,num = zip(*[v[1:] for v in wnreads[ql:qr]])
loc_ab = float(sum(num))/float(sum(den)) if any(den) else 0.0
elif self.stat == 'tavg_l':
loc_ab = np.mean(sorted([float(n)/r for r,n in rat_nreads])[ql:qr])
loc_ab = np.mean(sorted([float(n)/(r - self.avg_read_length + 1) for r,n in rat_nreads])[ql:qr])
elif self.stat == 'wavg_g':
vmin, vmax = nreads_v[ql], nreads_v[qr]
wnreads = [vmin]*qn+list(nreads_v[ql:qr])+[vmax]*qn
loc_ab = float(sum(wnreads)) / rat
elif self.stat == 'wavg_l':
wnreads = sorted([float(n)/r for r,n in rat_nreads])
wnreads = sorted([float(n)/(r - self.avg_read_length + 1) for r,n in rat_nreads])
vmin, vmax = wnreads[ql], wnreads[qr]
wnreads = [vmin]*qn+list(wnreads[ql:qr])+[vmax]*qn
loc_ab = np.mean(wnreads)
elif self.stat == 'med':
loc_ab = np.median(sorted([float(n)/r for r,n in rat_nreads])[ql:qr])
loc_ab = np.median(sorted([float(n)/(r - self.avg_read_length +1) for r,n in rat_nreads])[ql:qr])

self.abundance = loc_ab
if rat < self.min_cu_len and self.children:
Expand Down Expand Up @@ -628,6 +635,7 @@ def __init__( self, mpa, markers_to_ignore = None ): #, min_cu_len ):
TaxClade.markers2lens = self.markers2lens
TaxClade.markers2exts = self.markers2exts
TaxClade.taxa2clades = self.taxa2clades
self.avg_read_length = 1

for clade, value in mpa['taxonomy'].items():
clade = clade.strip().split("|")
Expand Down Expand Up @@ -674,11 +682,12 @@ def add_lens( node ):
def set_min_cu_len( self, min_cu_len ):
TaxClade.min_cu_len = min_cu_len

def set_stat( self, stat, quantile, perc_nonzero, avoid_disqm = False):
def set_stat( self, stat, quantile, perc_nonzero, avg_read_length, avoid_disqm = False):
TaxClade.stat = stat
TaxClade.perc_nonzero = perc_nonzero
TaxClade.quantile = quantile
TaxClade.avoid_disqm = avoid_disqm
TaxClade.avg_read_length = avg_read_length

def add_reads( self, marker, n,
add_viruses = False,
Expand Down Expand Up @@ -794,11 +803,14 @@ def map2bbh(mapping_f, min_mapq_val, input_type='bowtie2out', min_alignment_len=

reads2markers = {}
n_metagenome_reads = None
avg_read_length = 1 #Set to 1 if it is not calculated from read_fastx

if input_type == 'bowtie2out':
for r, c in ras(inpf):
if r.startswith('#') and 'nreads' in r:
n_metagenome_reads = int(c)
if r.startswith('#') and 'avg_read_length' in r:
avg_read_length = float(c)
else:
reads2markers[r] = c
elif input_type == 'sam':
Expand All @@ -816,7 +828,7 @@ def map2bbh(mapping_f, min_mapq_val, input_type='bowtie2out', min_alignment_len=
for r, m in reads2markers.items():
markers2reads[m].add(r)

return (markers2reads, n_metagenome_reads)
return (markers2reads, n_metagenome_reads, avg_read_length)

def maybe_generate_biom_file(tree, pars, abundance_predictions):
json_key = "MetaPhlAn"
Expand Down Expand Up @@ -994,9 +1006,11 @@ def main():
REPORT_MERGED = mpa_pkl.get('merged_taxon',False)
tree = TaxTree( mpa_pkl, ignore_markers )
tree.set_min_cu_len( pars['min_cu_len'] )
tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], pars['avoid_disqm'])

markers2reads, n_metagenome_reads, avg_read_length = map2bbh(pars['inp'], pars['min_mapq_val'], pars['input_type'], pars['min_alignment_len'])

tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'])

markers2reads, n_metagenome_reads = map2bbh(pars['inp'], pars['min_mapq_val'], pars['input_type'], pars['min_alignment_len'])
if no_map:
os.remove( pars['inp'] )

Expand Down
119 changes: 61 additions & 58 deletions metaphlan/utils/read_fastx.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,75 +55,75 @@ def fopen(fn):

def read_and_write_raw_int(fd, min_len=None):
fmt = None
avg_read_length = 0
nreads = 0
discarded = 0
if min_len:
r = []
#if min_len:
r = []

while True:
l = fd.readline()

if not fmt:
fmt = fastx(l)
parser = FastqGeneralIterator if fmt == 'fastq' else SimpleFastaParser
readn = 4 if fmt == 'fastq' else 2

r.append(l)

if len(r) == readn:
break

for record in parser(uio.StringIO("".join(r))):
if readn == 4:
description, sequence, qual = record
else:
qual = None
description, sequence = record

if len(sequence) >= min_len:
description = ignore_spaces(description, forced=True)
_ = sys.stdout.write(print_record(description, sequence, qual, fmt))
else:
discarded = discarded + 1

for idx, record in enumerate(parser(fd),2):
if readn == 4:
description, sequence, qual = record
else:
qual = None
description, sequence = record

if len(sequence) >= min_len:
description = ignore_spaces(description, forced=True)
_ = sys.stdout.write(print_record(description, sequence, qual, fmt))
else:
discarded = discarded + 1
else:
for idx, l in enumerate(fd,1):
_ = sys.stdout.write(ignore_spaces(l))

#Read the first line of the file again to determine whether it is a FASTA or a FASTQ
fd.seek(0)
while True:
l = fd.readline()
readn = 4 if fastx(l) == 'fastq' else 2
idx = idx // readn

if not fmt:
fmt = fastx(l)
parser = FastqGeneralIterator if fmt == 'fastq' else SimpleFastaParser
readn = 4 if fmt == 'fastq' else 2

r.append(l)

if len(r) == readn:
break

for record in parser(uio.StringIO("".join(r))):
if readn == 4:
description, sequence, qual = record
else:
qual = None
description, sequence = record

if len(sequence) >= min_len:
description = ignore_spaces(description, forced=True)
avg_read_length = len(sequence) + avg_read_length
_ = sys.stdout.write(print_record(description, sequence, qual, fmt))
else:
discarded = discarded + 1

for idx, record in enumerate(parser(fd),2):
if readn == 4:
description, sequence, qual = record
else:
qual = None
description, sequence = record

if len(sequence) >= min_len:
avg_read_length = len(sequence) + avg_read_length
description = ignore_spaces(description, forced=True)
_ = sys.stdout.write(print_record(description, sequence, qual, fmt))
else:
discarded = discarded + 1
# else:
# for idx, l in enumerate(fd,1):
# avg_read_length = len(l) + avg_read_length
# _ = sys.stdout.write(ignore_spaces(l))

nreads = idx - discarded
return nreads
avg_read_length /= nreads
return (nreads, avg_read_length)

def read_and_write_raw(fd, opened=False, min_len=None):
if opened: #fd is stdin
nreads = read_and_write_raw_int(fd, min_len=min_len)
nreads, avg_read_length = read_and_write_raw_int(fd, min_len=min_len)
else:
with fopen(fd) as inf:
nreads = read_and_write_raw_int(inf, min_len=min_len)
return nreads
nreads, avg_read_length = read_and_write_raw_int(inf, min_len=min_len)
return (nreads, avg_read_length)


def main():
min_len = None
min_len = 0
args = []
nreads = None
avg_read_length = None

if len(sys.argv) > 1:
for l in sys.argv[1:]:
Expand All @@ -140,10 +140,10 @@ def main():
args.append(l)

if len(args) == 0:
nreads = read_and_write_raw(sys.stdin, opened=True, min_len=min_len)
nreads, avg_read_length = read_and_write_raw(sys.stdin, opened=True, min_len=min_len)
else:
files = []
nreads = 0
avg_read_length, nreads = 0, 0
for a in args:
for f in a.split(','):
if os.path.isdir(f):
Expand All @@ -152,10 +152,13 @@ def main():
files += [f]

for f in files:
nreads += read_and_write_raw(f, opened=False, min_len=min_len)
f_nreads, f_avg_read_length = read_and_write_raw(f, opened=False, min_len=min_len)
nreads += f_nreads
avg_read_length += f_avg_read_length
avg_read_length = avg_read_length / len(files)

if nreads:
sys.stderr.write(str(nreads))
if nreads and avg_read_length:
sys.stderr.write('{}\t{}'.format(nreads,avg_read_length) )
else:
exit(1)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

setuptools.setup(
name='MetaPhlAn',
version='3.0.4',
version='3.0.5',
author='Francesco Beghini',
author_email='[email protected]',
url='http://github.com/biobakery/MetaPhlAn/',
Expand Down

0 comments on commit 75bb198

Please sign in to comment.