From d0f1748098e6d136d4c6231a8b5d282f5df053d7 Mon Sep 17 00:00:00 2001
From: Ivan Kavaliou
Date: Fri, 3 Mar 2017 10:56:09 +0300
Subject: [PATCH] #87 added db/csv config of the parser

---
 .../data_analytics/csv_cards_analyzer.py | 68 ++++++++++++++++---
 .../scripts/data_analytics/csv_parser.py |  2 +-
 2 files changed, 60 insertions(+), 10 deletions(-)

diff --git a/nextgisbio/scripts/data_analytics/csv_cards_analyzer.py b/nextgisbio/scripts/data_analytics/csv_cards_analyzer.py
index 2d70c963..91fccacf 100644
--- a/nextgisbio/scripts/data_analytics/csv_cards_analyzer.py
+++ b/nextgisbio/scripts/data_analytics/csv_cards_analyzer.py
@@ -33,10 +33,6 @@
 from csv_parser import parse_data
 
 
-conf_parser = {
-    'csv': True
-}
-
 data_structure = [
     {
         'main': {
@@ -54,6 +50,54 @@
             'db_table_name': Person.name
         }
     },
+    {
+        'main': {
+            'table': 'cards.csv',
+            'field': 'observer',
+            'db_table': Cards,
+            'db_table_id': Cards.observer
+        },
+        'relation': {
+            'table': 'person.csv',
+            'id_field': 'id',
+            'name_field': 'name',
+            'db_table': Person,
+            'db_table_id': Person.id,
+            'db_table_name': Person.name
+        }
+    },
+    {
+        'main': {
+            'table': 'cards.csv',
+            'field': 'footprint',
+            'db_table': Cards,
+            'db_table_id': Cards.footprint
+        },
+        'relation': {
+            'table': 'footprint.csv',
+            'id_field': 'id',
+            'name_field': 'footprint',
+            'db_table': Footprint,
+            'db_table_id': Footprint.id,
+            'db_table_name': Footprint.footprint
+        }
+    },
+    {
+        'main': {
+            'table': 'cards.csv',
+            'field': 'pheno',
+            'db_table': Cards,
+            'db_table_id': Cards.pheno
+        },
+        'relation': {
+            'table': 'pheno.csv',
+            'id_field': 'id',
+            'name_field': 'pheno',
+            'db_table': Pheno,
+            'db_table_id': Pheno.id,
+            'db_table_name': Pheno.pheno
+        }
+    },
     {
         'main': {
             'table': 'cards.csv',
@@ -73,7 +117,7 @@
 ]
 
 
-def analyze(csv_data):
+def analyze(csv_data, conf_parser):
     for data_structure_item in data_structure:
         if 'csv' in conf_parser:
             csv_handle(csv_data, data_structure_item)
@@ -149,17 +193,23 @@ def db_handle(csv_data, data_structure_item):
             relation_db_aggregated[relation_db_id]['count'] += 1
         else:
             relation_db_aggregated[relation_db_id] = {
+                'id': relation_db_id,
                 'name': db_item[2],
                 'count': 1
             }
 
+    relation_db_aggregated_items = sorted(relation_db_aggregated.values(),
+                                          key=lambda(v): v['count'],
+                                          reverse=True)
     print '\n -------------------'
-    print data_structure_item['main']['table'] + ' -> ' + data_structure_item['relation']['table']
+    print 'DB - ' + data_structure_item['main']['table'] + ' -> ' + data_structure_item['relation']['table']
     print 'by field "' + main_field + '"'
     print '-------------------'
 
-    for k in relation_db_aggregated:
-        print u'{0} - {1} = {2}'.format(k, relation_db_aggregated[k]['name'], relation_db_aggregated[k]['count'])
+    for relation_db_aggregated_item in relation_db_aggregated_items:
+        print u'{0} - {1} = {2}'.format(relation_db_aggregated_item['id'],
+                                        relation_db_aggregated_item['name'],
+                                        relation_db_aggregated_item['count'])
 
 
 def usage(argv):
@@ -183,7 +233,7 @@ def main(argv=sys.argv):
     rel_path_source_data = "./csv/"
     source_data_dir = os.path.join(script_dir, rel_path_source_data)
     csv_data = parse_data(source_data_dir)
-    analyze(csv_data)
+    analyze(csv_data, conf_parser)
 
 
 if __name__ == "__main__":
diff --git a/nextgisbio/scripts/data_analytics/csv_parser.py b/nextgisbio/scripts/data_analytics/csv_parser.py
index 4bc2a396..d76c340a 100644
--- a/nextgisbio/scripts/data_analytics/csv_parser.py
+++ b/nextgisbio/scripts/data_analytics/csv_parser.py
@@ -52,6 +52,6 @@ def parse_data(data_dir_name):
                 from operator import itemgetter
 
                 data[csv_file]['records'] = sorted(data[csv_file]['records'], key=itemgetter(0))
-                print '%s parsed' % csv_file
+                # print '%s parsed' % csv_file
 
     return data
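For context, a minimal driver sketch showing how the reworked entry point could be used now that the module-level conf_parser constant is gone and analyze() takes its configuration from the caller. Only the {'csv': True} shape and the analyze(csv_data, conf_parser) signature come from the patch above; the '--csv'/'--db' flags, the 'db' key, and the run() helper are assumptions added purely for illustration.

# Hypothetical driver (not part of the patch). Assumes a 'db' key switches
# on db_handle() the same way 'csv' switches on csv_handle() in analyze();
# the command-line flag names are invented.
import os
import sys

from csv_parser import parse_data
from csv_cards_analyzer import analyze


def run(argv=sys.argv):
    conf_parser = {}
    if '--csv' in argv:
        conf_parser['csv'] = True   # aggregate relations from the CSV dumps
    if '--db' in argv:
        conf_parser['db'] = True    # aggregate relations from the database tables

    # Same data location the existing main() uses: ./csv/ next to the script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    csv_data = parse_data(os.path.join(script_dir, './csv/'))
    analyze(csv_data, conf_parser)


if __name__ == '__main__':
    run()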