Skip to content

Commit

Permalink
add option to assess taxonomic binning as genome binning
Browse files Browse the repository at this point in the history
  • Loading branch information
fernandomeyer committed Mar 25, 2019
1 parent a5674fb commit 3a3259a
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 13 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ usage: AMBER [-h] -g GOLD_STANDARD_FILE [-f FASTA_FILE] [-l LABELS]
[-c] [-r REMOVE_GENOMES] [-k KEYWORD]
[--ncbi_nodes_file NCBI_NODES_FILE]
[--ncbi_names_file NCBI_NAMES_FILE]
[--rank_as_genome_binning RANK_AS_GENOME_BINNING]
bin_files [bin_files ...]

AMBER: Assessment of Metagenome BinnERs
Expand Down Expand Up @@ -135,6 +136,10 @@ taxonomic binning-specific arguments:
NCBI nodes file
--ncbi_names_file NCBI_NAMES_FILE
NCBI names file
--rank_as_genome_binning RANK_AS_GENOME_BINNING
Assess taxonomic binning at a rank also as genome
binning. Valid ranks: superkingdom, phylum, class,
order, family, genus, species, strain
~~~
**Example:**
~~~BASH
Expand Down
4 changes: 3 additions & 1 deletion amber.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ def main(args=None):
group_t = parser.add_argument_group('taxonomic binning-specific arguments')
group_t.add_argument('--ncbi_nodes_file', help="NCBI nodes file", required=False)
group_t.add_argument('--ncbi_names_file', help="NCBI names file", required=False)
group_t.add_argument('--rank_as_genome_binning', help="Assess taxonomic binning at a rank also as genome binning. Valid ranks: superkingdom, phylum, class, order, family, genus, species, strain", required=False)

args = parser.parse_args(args)

Expand All @@ -286,7 +287,8 @@ def main(args=None):
filter_genomes_file=args.remove_genomes,
filter_keyword=args.keyword,
map_by_completeness=args.map_by_completeness,
min_length=args.min_length)
min_length=args.min_length,
rank_as_genome_binning=args.rank_as_genome_binning)

load_data.load_ncbi_info(args.ncbi_nodes_file, args.ncbi_names_file)

Expand Down
13 changes: 12 additions & 1 deletion src/binning_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,12 +451,15 @@ def get_metrics_dict(self, gold_standard):


class Options:
def __init__(self, filter_tail_percentage, filter_genomes_file, filter_keyword, map_by_completeness, min_length):
def __init__(self, filter_tail_percentage, filter_genomes_file, filter_keyword, map_by_completeness, min_length, rank_as_genome_binning):
self.__filter_tail_percentage = float(filter_tail_percentage) if filter_tail_percentage else .0
self.__filter_genomes_file = filter_genomes_file
self.__filter_keyword = filter_keyword
self.__map_by_completeness = map_by_completeness
self.__min_length = int(min_length) if min_length else 0
if rank_as_genome_binning and rank_as_genome_binning not in load_ncbi_taxinfo.RANKS:
exit("Not a valid rank to assess taxonomic binning as genome binning (option --rank_as_genome_binning): " + rank_as_genome_binning)
self.__rank_as_genome_binning = rank_as_genome_binning

@property
def filter_tail_percentage(self):
Expand All @@ -478,6 +481,10 @@ def map_by_completeness(self):
def min_length(self):
return self.__min_length

@property
def rank_as_genome_binning(self):
return self.__rank_as_genome_binning

@filter_tail_percentage.setter
def filter_tail_percentage(self, filter_tail_percentage):
self.__filter_tail_percentage = filter_tail_percentage
Expand All @@ -497,3 +504,7 @@ def map_by_completeness(self, map_by_completeness):
@min_length.setter
def min_length(self, min_length):
self.__min_length = min_length

@rank_as_genome_binning.setter
def rank_as_genome_binning(self, rank_as_genome_binning):
self.__rank_as_genome_binning = rank_as_genome_binning
53 changes: 43 additions & 10 deletions src/utils/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,25 +322,55 @@ def load_ncbi_info(ncbi_nodes_file, ncbi_names_file):
binning_classes.TaxonomicQuery.tax_id_to_name = load_ncbi_taxinfo.load_names(binning_classes.TaxonomicQuery.tax_id_to_rank, ncbi_names_file)


def create_genome_queries_from_taxonomic_queries(rank, sample_id_to_g_query, sample_id_to_queries_list):
    """Derive a genome-binning query from every taxonomic query at ``rank``.

    For each query in ``sample_id_to_queries_list`` that is not already a
    ``GenomeQuery``, a new ``GenomeQuery`` is built whose bins are the bins the
    query assigned at ``rank``. The new queries are appended to
    ``sample_id_to_queries_list`` in place.

    Args:
        rank: Key into each query's ``rank_to_sequence_id_to_bin_id`` mapping
            selecting the taxonomic rank to convert.
        sample_id_to_g_query: Mapping of sample id to the genome binning gold
            standard of that sample. Samples missing from this mapping are
            reported on stdout and skipped.
        sample_id_to_queries_list: Mapping of sample id to the list of queries
            of that sample; extended in place with the derived genome queries.

    Returns:
        None. The result is the mutation of ``sample_id_to_queries_list``.
    """
    # Derived queries are collected separately and merged at the end so that
    # the per-sample lists are not modified while they are being iterated.
    sample_id_to_genome_queries = defaultdict(list)

    for sample_id, queries in sample_id_to_queries_list.items():
        for query in queries:
            # Genome queries need no conversion.
            if isinstance(query, binning_classes.GenomeQuery):
                continue
            if sample_id not in sample_id_to_g_query:
                print("No genome binning gold standard available for sample " + sample_id)
                continue

            g_query = binning_classes.GenomeQuery()
            sample_id_to_genome_queries[sample_id].append(g_query)
            g_query.options = query.options
            g_query.label = query.label
            g_query.gold_standard = sample_id_to_g_query[sample_id]

            # Hoist the lookups that do not change per sequence.
            sequence_id_to_bin_id = query.rank_to_sequence_id_to_bin_id[rank]
            sequence_id_to_length = query.gold_standard.sequence_id_to_length

            for sequence_id, bin_id in sequence_id_to_bin_id.items():
                # Create the bin on first sight of its id, reuse it afterwards.
                if bin_id not in g_query.get_bin_ids():
                    genome_bin = binning_classes.GenomeBin(bin_id)
                    g_query.add_bin(genome_bin)
                else:
                    genome_bin = g_query.get_bin_by_id(bin_id)
                # NOTE(review): presumably a property setter that records one
                # (sequence id, bin id) pair per assignment -- confirm in
                # binning_classes.GenomeQuery.
                g_query.sequence_id_to_bin_id = (sequence_id, bin_id)
                genome_bin.add_sequence_id(sequence_id, sequence_id_to_length[sequence_id])

    for sample_id, genome_queries in sample_id_to_genome_queries.items():
        sample_id_to_queries_list[sample_id].extend(genome_queries)


def load_queries(gold_standard_file, fastx_file, query_files, options, labels):
gold_standard = open_query(gold_standard_file,
True,
fastx_file,
None, None,
options,
None)
sample_ids_list, sample_id_to_g_query, sample_id_to_t_query = \
open_query(gold_standard_file,
True,
fastx_file,
None, None,
options,
None)
sample_id_to_num_genomes = None
if gold_standard[1]:
if sample_id_to_g_query:
sample_id_to_num_genomes = {}
for sample_id, g_query in gold_standard[1].items():
for sample_id, g_query in sample_id_to_g_query.items():
sample_id_to_num_genomes[sample_id] = len(g_query.bins)

sample_id_to_queries_list = defaultdict(list)
for query_file, label in zip(query_files, labels):
query = open_query(query_file,
False,
None,
gold_standard[1], gold_standard[2],
sample_id_to_g_query, sample_id_to_t_query,
options,
label)
for sample_id_to_query in [query[1], query[2]]:
Expand All @@ -351,4 +381,7 @@ def load_queries(gold_standard_file, fastx_file, query_files, options, labels):

# TODO if there is a g_query (t_query), there must be a g_gold_standard (t_gold_standard)

return [sample_id for sample_id in gold_standard[0]], sample_id_to_num_genomes, sample_id_to_queries_list
if options.rank_as_genome_binning:
create_genome_queries_from_taxonomic_queries(options.rank_as_genome_binning, sample_id_to_g_query, sample_id_to_queries_list)

return sample_ids_list, sample_id_to_num_genomes, sample_id_to_queries_list
2 changes: 1 addition & 1 deletion version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '2.0.11-beta'
__version__ = '2.0.12-beta'

0 comments on commit 3a3259a

Please sign in to comment.