From ebe2281f39cd5e35dd12422d85acccbfefe686d2 Mon Sep 17 00:00:00 2001 From: tcezard Date: Fri, 12 Apr 2024 15:22:03 +0100 Subject: [PATCH 1/6] Remove duplicate counts when aggregating per taxonomy or assembly --- .../gather_release_counts.py | 54 ++++++++++++------- ...or_haplochromini_oreochromis_niloticus.log | 6 +++ .../tests/test_gather_release_counts.py | 32 +++++++++-- 3 files changed, 67 insertions(+), 25 deletions(-) create mode 100644 eva-accession-release-automation/gather_clustering_counts/tests/count_for_haplochromini_oreochromis_niloticus.log diff --git a/eva-accession-release-automation/gather_clustering_counts/gather_release_counts.py b/eva-accession-release-automation/gather_clustering_counts/gather_release_counts.py index 6bacf56ee..81d504b89 100644 --- a/eva-accession-release-automation/gather_clustering_counts/gather_release_counts.py +++ b/eva-accession-release-automation/gather_clustering_counts/gather_release_counts.py @@ -279,7 +279,7 @@ def _write_per_taxonomy_counts(self, session): self.info(f"Create persistence for aggregate per taxonomy {taxonomy_id}") taxonomy_row = RSCountPerTaxonomy( taxonomy_id=taxonomy_id, - assembly_accessions=species_annotation.get(taxonomy_id).get('assemblies'), + assembly_accessions=list(species_annotation.get(taxonomy_id).get('assemblies')), release_folder=species_annotation.get(taxonomy_id).get('release_folder'), release_version=self.release_version, ) @@ -375,6 +375,11 @@ def get_assembly_counts_from_database(self): return results def parse_count_script_logs(self, all_logs): + ''' + Create a list of grouped count + :param all_logs: + :return: + ''' for log_file in all_logs: with open(log_file) as open_file: for line in open_file: @@ -394,31 +399,40 @@ def generate_per_taxonomy_counts(self): species_counts = defaultdict(Counter) species_annotations = defaultdict(dict) for count_groups in self.all_counts_grouped: - for count_dict in count_groups: - species_counts[count_dict['taxonomy']][count_dict['idtype']] += 
count_dict['count'] - if 'assemblies' not in species_annotations.get(count_dict['taxonomy'], {}): - species_annotations[count_dict['taxonomy']] = { - 'assemblies': set(), - 'release_folder': None - } - - species_annotations[count_dict['taxonomy']]['assemblies'].add(count_dict['assembly']) - species_annotations[count_dict['taxonomy']]['release_folder'] = count_dict['release_folder'] + taxonomy_and_types = set([(count_dict['taxonomy'], count_dict['idtype']) for count_dict in count_groups]) + for taxonomy, rstype in taxonomy_and_types: + if taxonomy not in species_annotations: + species_annotations[taxonomy] = {'assemblies': set(), 'release_folder': None} + # All count_dict in a group share one count: add it once per (taxonomy, idtype); match by value (==), not identity + species_counts[taxonomy][rstype] += count_groups[0]['count'] + species_annotations[taxonomy]['assemblies'].update( + set([ + count_dict['assembly'] + for count_dict in count_groups + if count_dict['taxonomy'] == taxonomy and count_dict['idtype'] == rstype + ]) + ) + species_annotations[taxonomy]['release_folder'] = count_groups[0]['release_folder'] return species_counts, species_annotations def generate_per_assembly_counts(self): assembly_counts = defaultdict(Counter) assembly_annotations = {} for count_groups in self.all_counts_grouped: - for count_dict in count_groups: - assembly_counts[count_dict['assembly']][count_dict['idtype']] += count_dict['count'] - if 'taxonomies' not in assembly_annotations.get(count_dict['assembly'], {}): - assembly_annotations[count_dict['assembly']] = { - 'taxonomies': set(), - 'release_folder': None - } - assembly_annotations[count_dict['assembly']]['taxonomies'].add(count_dict['taxonomy']) - assembly_annotations[count_dict['assembly']]['release_folder'] = count_dict['assembly'] + assembly_and_types = set([(count_dict['assembly'], count_dict['idtype']) for count_dict in count_groups]) + for assembly_accession, rstype in assembly_and_types: + if assembly_accession not in assembly_annotations: + assembly_annotations[assembly_accession]
= {'taxonomies': set(), 'release_folder': None} + # All count_dict in a group share one count: add it once per (assembly, idtype); match by value (==), not identity + assembly_counts[assembly_accession][rstype] += count_groups[0]['count'] + assembly_annotations[assembly_accession]['taxonomies'].update( + set([ + count_dict['taxonomy'] + for count_dict in count_groups + if count_dict['assembly'] == assembly_accession and count_dict['idtype'] == rstype + ])) + + assembly_annotations[assembly_accession]['release_folder'] = assembly_accession return assembly_counts, assembly_annotations # def generate_per_species_assembly_counts(self): diff --git a/eva-accession-release-automation/gather_clustering_counts/tests/count_for_haplochromini_oreochromis_niloticus.log b/eva-accession-release-automation/gather_clustering_counts/tests/count_for_haplochromini_oreochromis_niloticus.log new file mode 100644 index 000000000..503b2e4b2 --- /dev/null +++ b/eva-accession-release-automation/gather_clustering_counts/tests/count_for_haplochromini_oreochromis_niloticus.log @@ -0,0 +1,6 @@ +18746871 GCA_000188235.2-oreochromis_niloticus-current,GCA_000188235.2-haplochromini-current, +4882192 GCA_001858045.3-oreochromis_niloticus-current, + 237 GCA_000188235.2-oreochromis_niloticus-current,GCA_001858045.3-oreochromis_niloticus-current,GCA_000188235.2-haplochromini-current, + 71 GCA_000188235.2-oreochromis_niloticus-deprecated,GCA_000188235.2-haplochromini-deprecated, + 17 Unmapped-oreochromis_niloticus-unmapped, + 14 GCA_000188235.2-haplochromini-multimap,GCA_000188235.2-oreochromis_niloticus-multimap, diff --git a/eva-accession-release-automation/gather_clustering_counts/tests/test_gather_release_counts.py b/eva-accession-release-automation/gather_clustering_counts/tests/test_gather_release_counts.py index d9e46186a..747616c63 100644 --- a/eva-accession-release-automation/gather_clustering_counts/tests/test_gather_release_counts.py +++ b/eva-accession-release-automation/gather_clustering_counts/tests/test_gather_release_counts.py @@ -1,4 +1,5 @@ import os
+from itertools import cycle from unittest import TestCase from unittest.mock import patch @@ -52,8 +53,11 @@ def test_write_counts_to_db(self): log_files_release1 = [os.path.join(self.resource_folder, 'count_for_release1.log')] log_files_release2 = [os.path.join(self.resource_folder, 'count_for_release2.log')] list_cow_assemblies = ['GCA_000003055.3', 'GCA_000003055.5', 'GCA_000003205.1', 'GCA_000003205.4', 'GCA_000003205.6', 'Unmapped'] - with patch.object(ReleaseCounter, 'get_taxonomy_and_scientific_name') as ptaxonomy: - ptaxonomy.return_value = (9913, 'Bos taurus') + folder_to_taxonomy = {'bos_taurus': 9913} + + with patch.object(ReleaseCounter, 'get_taxonomy') as ptaxonomy: + # ptaxonomy.side_effect = lambda x: folder_to_taxonomy.get(x) + ptaxonomy.return_value = 9913 counter = ReleaseCounter(self.private_config_xml_file, config_profile=self.config_profile, release_version=1, logs=log_files_release1) counter.write_counts_to_db() @@ -67,7 +71,7 @@ def test_write_counts_to_db(self): result = session.execute(query).fetchone() rs_taxonomy_count = result.RSCountPerTaxonomy assert sorted(rs_taxonomy_count.assembly_accessions) == list_cow_assemblies - assert rs_taxonomy_count.current_rs == 169904286 + assert rs_taxonomy_count.current_rs == 102813585 assert rs_taxonomy_count.new_current_rs == 0 assert rs_taxonomy_count.release_folder == 'Cow_9913' @@ -76,8 +80,8 @@ def test_write_counts_to_db(self): result = session.execute(query).fetchone() rs_taxonomy_count = result.RSCountPerTaxonomy assert sorted(rs_taxonomy_count.assembly_accessions) == list_cow_assemblies - assert rs_taxonomy_count.current_rs == 169101573 - assert rs_taxonomy_count.new_current_rs == -802713 + assert rs_taxonomy_count.current_rs == 102605893 + assert rs_taxonomy_count.new_current_rs == -207692 assert rs_taxonomy_count.release_folder == 'bos_taurus' query = select(RSCountPerAssembly).where(RSCountPerAssembly.assembly_accession == 'GCA_000003205.6', @@ -89,3 +93,21 @@ def 
test_write_counts_to_db(self): assert rs_assembly_count.new_current_rs == 0 assert rs_assembly_count.release_folder == 'GCA_000003205.6' + def test_write_counts_to_db2(self): + log_files_release = [os.path.join(self.resource_folder, 'count_for_haplochromini_oreochromis_niloticus.log')] + folder_to_taxonomy = {'oreochromis_niloticus': 8128, 'haplochromini': 319058} + + with patch.object(ReleaseCounter, 'get_taxonomy') as ptaxonomy: + ptaxonomy.side_effect = lambda x: folder_to_taxonomy.get(x) + counter = ReleaseCounter(self.private_config_xml_file, config_profile=self.config_profile, + release_version=4, logs=log_files_release) + counter.write_counts_to_db() + session = Session(counter.sqlalchemy_engine) + + query = select(RSCountPerAssembly).where(RSCountPerAssembly.assembly_accession == 'GCA_000188235.2', + RSCountPerAssembly.release_version == 4) + result = session.execute(query).fetchone() + rs_assembly_count = result.RSCountPerAssembly + assert rs_assembly_count.current_rs == 18747108 # 18746871 + 237 + assert rs_assembly_count.release_folder == 'GCA_000188235.2' + From fd0a63c2b24098c9ce949001a6aa5b1957ee447c Mon Sep 17 00:00:00 2001 From: tcezard Date: Fri, 12 Apr 2024 15:33:30 +0100 Subject: [PATCH 2/6] Fix requirements.txt --- eva-accession-release-automation/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eva-accession-release-automation/requirements.txt b/eva-accession-release-automation/requirements.txt index 31582abfa..18d616e46 100644 --- a/eva-accession-release-automation/requirements.txt +++ b/eva-accession-release-automation/requirements.txt @@ -1,4 +1,4 @@ -ebi_eva_common_pyutils[internal]==0.6.5 +ebi_eva_common_pyutils[eva-internal]==0.6.5 click==7.1.2 pytz==2022.6 pyyaml==5.3.1 From 10ed1734c7eba5f062709dacceed2309f167391f Mon Sep 17 00:00:00 2001 From: tcezard Date: Fri, 12 Apr 2024 15:49:16 +0100 Subject: [PATCH 3/6] Update to use new version of ebi_eva_common_pyutils --- 
.../gather_clustering_counts_from_mongo.py | 8 ++++---- .../gather_per_species_clustering_counts.py | 4 ++-- .../gather_clustering_counts/gather_release_counts.py | 6 +++--- .../load_additional_counts_from_tsv.py | 4 ++-- .../gather_clustering_counts/qc_release_counts.py | 4 ++-- .../publish_release_files_to_ftp.py | 4 ++-- .../copy_accessioning_collections_to_embassy.py | 6 +++--- .../create_release_properties_file.py | 4 ++-- .../create_release_tracking_table.py | 8 ++++---- .../create_table_for_multimap_snps.py | 2 +- .../initiate_release_status_for_assembly.py | 2 +- .../merge_dbsnp_eva_release_files.py | 2 +- .../run_release_in_embassy/release_common_utils.py | 2 +- .../run_release_in_embassy/release_metadata.py | 2 +- .../run_release_in_embassy/run_release_for_species.py | 2 +- .../update_release_status_for_assembly.py | 2 +- .../run_release_in_embassy/validate_release_vcf_files.py | 2 +- 17 files changed, 32 insertions(+), 32 deletions(-) diff --git a/eva-accession-release-automation/gather_clustering_counts/gather_clustering_counts_from_mongo.py b/eva-accession-release-automation/gather_clustering_counts/gather_clustering_counts_from_mongo.py index 4908b4274..204449c7d 100644 --- a/eva-accession-release-automation/gather_clustering_counts/gather_clustering_counts_from_mongo.py +++ b/eva-accession-release-automation/gather_clustering_counts/gather_clustering_counts_from_mongo.py @@ -8,10 +8,10 @@ from datetime import datetime from ebi_eva_common_pyutils.logger import logging_config -from ebi_eva_common_pyutils.mongodb import MongoDatabase -from ebi_eva_common_pyutils.config_utils import get_accession_pg_creds_for_profile -from ebi_eva_common_pyutils.pg_utils import execute_query, get_all_results_for_query -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.mongodb import MongoDatabase +from ebi_eva_internal_pyutils.config_utils import get_accession_pg_creds_for_profile +from 
ebi_eva_internal_pyutils.pg_utils import execute_query, get_all_results_for_query +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle from gather_clustering_counts.gather_per_species_clustering_counts import assembly_table_name, tracker_table_name from urllib.parse import urlsplit diff --git a/eva-accession-release-automation/gather_clustering_counts/gather_per_species_clustering_counts.py b/eva-accession-release-automation/gather_clustering_counts/gather_per_species_clustering_counts.py index a66313db5..0406f52a7 100644 --- a/eva-accession-release-automation/gather_clustering_counts/gather_per_species_clustering_counts.py +++ b/eva-accession-release-automation/gather_clustering_counts/gather_per_species_clustering_counts.py @@ -2,9 +2,9 @@ import os from ebi_eva_common_pyutils.logger import logging_config -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle from ebi_eva_common_pyutils.command_utils import run_command_with_output -from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query, execute_query +from ebi_eva_internal_pyutils.pg_utils import get_all_results_for_query, execute_query logger = logging_config.get_logger(__name__) logging_config.add_stdout_handler() diff --git a/eva-accession-release-automation/gather_clustering_counts/gather_release_counts.py b/eva-accession-release-automation/gather_clustering_counts/gather_release_counts.py index 81d504b89..30ffe11f8 100644 --- a/eva-accession-release-automation/gather_clustering_counts/gather_release_counts.py +++ b/eva-accession-release-automation/gather_clustering_counts/gather_release_counts.py @@ -8,10 +8,10 @@ from ebi_eva_common_pyutils.command_utils import run_command_with_output from ebi_eva_common_pyutils.common_utils import pretty_print -from ebi_eva_common_pyutils.config_utils import get_metadata_creds_for_profile from 
ebi_eva_common_pyutils.logger import logging_config, AppLogger -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle -from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query +from ebi_eva_internal_pyutils.config_utils import get_metadata_creds_for_profile +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.pg_utils import get_all_results_for_query from sqlalchemy import select from sqlalchemy.orm import Session diff --git a/eva-accession-release-automation/gather_clustering_counts/load_additional_counts_from_tsv.py b/eva-accession-release-automation/gather_clustering_counts/load_additional_counts_from_tsv.py index 25f79d11d..64ba1bff5 100644 --- a/eva-accession-release-automation/gather_clustering_counts/load_additional_counts_from_tsv.py +++ b/eva-accession-release-automation/gather_clustering_counts/load_additional_counts_from_tsv.py @@ -1,9 +1,9 @@ import argparse import csv -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle from ebi_eva_common_pyutils.logger import logging_config -from ebi_eva_common_pyutils.pg_utils import execute_query +from ebi_eva_internal_pyutils.pg_utils import execute_query from gather_clustering_counts.gather_per_species_clustering_counts import assembly_table_name diff --git a/eva-accession-release-automation/gather_clustering_counts/qc_release_counts.py b/eva-accession-release-automation/gather_clustering_counts/qc_release_counts.py index 603bc42c8..7dc0400a3 100644 --- a/eva-accession-release-automation/gather_clustering_counts/qc_release_counts.py +++ b/eva-accession-release-automation/gather_clustering_counts/qc_release_counts.py @@ -7,8 +7,8 @@ from ebi_eva_common_pyutils.common_utils import pretty_print from ebi_eva_common_pyutils.logger import logging_config -from ebi_eva_common_pyutils.metadata_utils import 
get_metadata_connection_handle -from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query, execute_query +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.pg_utils import get_all_results_for_query, execute_query from gather_clustering_counts.gather_per_species_clustering_counts import get_taxonomy_and_scientific_name, \ assembly_table_name, id_to_column diff --git a/eva-accession-release-automation/publish_release_to_ftp/publish_release_files_to_ftp.py b/eva-accession-release-automation/publish_release_to_ftp/publish_release_files_to_ftp.py index 8706b470c..fddf06bca 100644 --- a/eva-accession-release-automation/publish_release_to_ftp/publish_release_files_to_ftp.py +++ b/eva-accession-release-automation/publish_release_to_ftp/publish_release_files_to_ftp.py @@ -23,8 +23,8 @@ from publish_release_to_ftp.create_assembly_name_symlinks import create_assembly_name_symlinks from ebi_eva_common_pyutils.command_utils import run_command_with_output from ebi_eva_common_pyutils.logger import logging_config -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle -from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.pg_utils import get_all_results_for_query from run_release_in_embassy.run_release_for_species import get_common_release_properties from run_release_in_embassy.release_metadata import release_vcf_file_categories, release_text_file_categories diff --git a/eva-accession-release-automation/run_release_in_embassy/copy_accessioning_collections_to_embassy.py b/eva-accession-release-automation/run_release_in_embassy/copy_accessioning_collections_to_embassy.py index 6bf3ab244..dc2c50c7f 100644 --- a/eva-accession-release-automation/run_release_in_embassy/copy_accessioning_collections_to_embassy.py +++ 
b/eva-accession-release-automation/run_release_in_embassy/copy_accessioning_collections_to_embassy.py @@ -19,9 +19,9 @@ import traceback from ebi_eva_common_pyutils.command_utils import run_command_with_output -from ebi_eva_common_pyutils.config_utils import get_mongo_uri_for_eva_profile -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle -from ebi_eva_common_pyutils.mongo_utils import copy_db +from ebi_eva_internal_pyutils.config_utils import get_mongo_uri_for_eva_profile +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.mongo_utils import copy_db from pymongo import MongoClient from pymongo.uri_parser import parse_uri from run_release_in_embassy.release_common_utils import open_mongo_port_to_tempmongo, close_mongo_port_to_tempmongo, \ diff --git a/eva-accession-release-automation/run_release_in_embassy/create_release_properties_file.py b/eva-accession-release-automation/run_release_in_embassy/create_release_properties_file.py index 72d160e25..c12e307f6 100644 --- a/eva-accession-release-automation/run_release_in_embassy/create_release_properties_file.py +++ b/eva-accession-release-automation/run_release_in_embassy/create_release_properties_file.py @@ -14,8 +14,8 @@ import os import click -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle -from ebi_eva_common_pyutils.spring_properties import SpringPropertiesGenerator +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.spring_properties import SpringPropertiesGenerator from run_release_in_embassy.release_common_utils import get_release_db_name_in_tempmongo_instance from run_release_in_embassy.release_metadata import get_release_inventory_info_for_assembly diff --git a/eva-accession-release-automation/run_release_in_embassy/create_release_tracking_table.py 
b/eva-accession-release-automation/run_release_in_embassy/create_release_tracking_table.py index 9d629cdd9..85d6523ad 100644 --- a/eva-accession-release-automation/run_release_in_embassy/create_release_tracking_table.py +++ b/eva-accession-release-automation/run_release_in_embassy/create_release_tracking_table.py @@ -18,11 +18,11 @@ from itertools import cycle from ebi_eva_common_pyutils.assembly import NCBIAssembly -from ebi_eva_common_pyutils.config_utils import get_mongo_uri_for_eva_profile +from ebi_eva_internal_pyutils.config_utils import get_mongo_uri_for_eva_profile from ebi_eva_common_pyutils.logger import logging_config, AppLogger -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle -from ebi_eva_common_pyutils.mongodb import MongoDatabase -from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query, execute_query +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.mongodb import MongoDatabase +from ebi_eva_internal_pyutils.pg_utils import get_all_results_for_query, execute_query from ebi_eva_common_pyutils.taxonomy.taxonomy import normalise_taxon_scientific_name, get_scientific_name_from_ensembl diff --git a/eva-accession-release-automation/run_release_in_embassy/create_table_for_multimap_snps.py b/eva-accession-release-automation/run_release_in_embassy/create_table_for_multimap_snps.py index eda984ffb..35feb6624 100644 --- a/eva-accession-release-automation/run_release_in_embassy/create_table_for_multimap_snps.py +++ b/eva-accession-release-automation/run_release_in_embassy/create_table_for_multimap_snps.py @@ -19,7 +19,7 @@ from include_mapping_weight_from_dbsnp.snpmapinfo_metadata import \ get_snpmapinfo_tables_with_overweight_snps_for_dbsnp_species, get_snpmapinfo_asm_columns from include_mapping_weight_from_dbsnp.dbsnp_mirror_metadata import get_db_conn_for_species, get_species_info -from ebi_eva_common_pyutils.pg_utils import 
get_pg_connection_handle, execute_query, create_index_on_table, \ +from ebi_eva_internal_pyutils.pg_utils import get_pg_connection_handle, execute_query, create_index_on_table, \ vacuum_analyze_table diff --git a/eva-accession-release-automation/run_release_in_embassy/initiate_release_status_for_assembly.py b/eva-accession-release-automation/run_release_in_embassy/initiate_release_status_for_assembly.py index 6a2fcddc2..2223cde57 100644 --- a/eva-accession-release-automation/run_release_in_embassy/initiate_release_status_for_assembly.py +++ b/eva-accession-release-automation/run_release_in_embassy/initiate_release_status_for_assembly.py @@ -17,7 +17,7 @@ from run_release_in_embassy.release_metadata import update_release_progress_status -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle logger = logging.getLogger(__name__) diff --git a/eva-accession-release-automation/run_release_in_embassy/merge_dbsnp_eva_release_files.py b/eva-accession-release-automation/run_release_in_embassy/merge_dbsnp_eva_release_files.py index 9a001d8ff..ad9da4c16 100644 --- a/eva-accession-release-automation/run_release_in_embassy/merge_dbsnp_eva_release_files.py +++ b/eva-accession-release-automation/run_release_in_embassy/merge_dbsnp_eva_release_files.py @@ -18,7 +18,7 @@ import os from ebi_eva_common_pyutils.command_utils import run_command_with_output -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle from run_release_in_embassy.release_metadata import release_vcf_file_categories, release_text_file_categories, \ get_release_inventory_info_for_assembly from run_release_in_embassy.release_common_utils import get_bgzip_bcftools_index_commands_for_file, \ diff --git a/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py 
b/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py index f29ae684a..ab6f6cade 100644 --- a/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py +++ b/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py @@ -18,7 +18,7 @@ import traceback from run_release_in_embassy.release_metadata import get_target_mongo_instance_for_assembly -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle from ebi_eva_common_pyutils.network_utils import get_available_local_port, forward_remote_port_to_local_port from ebi_eva_common_pyutils.taxonomy import taxonomy diff --git a/eva-accession-release-automation/run_release_in_embassy/release_metadata.py b/eva-accession-release-automation/run_release_in_embassy/release_metadata.py index d7e1a8278..707adafb0 100644 --- a/eva-accession-release-automation/run_release_in_embassy/release_metadata.py +++ b/eva-accession-release-automation/run_release_in_embassy/release_metadata.py @@ -13,7 +13,7 @@ # limitations under the License. 
import datetime -from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query +from ebi_eva_internal_pyutils.pg_utils import get_all_results_for_query release_vcf_file_categories = ["current_ids", "merged_ids"] release_text_file_categories = ["deprecated_ids", "merged_deprecated_ids"] diff --git a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py index f0e782d24..472768817 100644 --- a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py +++ b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py @@ -21,7 +21,7 @@ import yaml from ebi_eva_common_pyutils.common_utils import merge_two_dicts -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle from run_release_in_embassy.release_metadata import get_release_assemblies_for_taxonomy from run_release_in_embassy.release_common_utils import get_release_folder_name diff --git a/eva-accession-release-automation/run_release_in_embassy/update_release_status_for_assembly.py b/eva-accession-release-automation/run_release_in_embassy/update_release_status_for_assembly.py index 28533b01d..3f59450a1 100644 --- a/eva-accession-release-automation/run_release_in_embassy/update_release_status_for_assembly.py +++ b/eva-accession-release-automation/run_release_in_embassy/update_release_status_for_assembly.py @@ -17,7 +17,7 @@ from run_release_in_embassy.release_metadata import update_release_progress_status -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle logger = logging.getLogger(__name__) diff --git a/eva-accession-release-automation/run_release_in_embassy/validate_release_vcf_files.py 
b/eva-accession-release-automation/run_release_in_embassy/validate_release_vcf_files.py index 863524184..cdf91bd59 100644 --- a/eva-accession-release-automation/run_release_in_embassy/validate_release_vcf_files.py +++ b/eva-accession-release-automation/run_release_in_embassy/validate_release_vcf_files.py @@ -19,7 +19,7 @@ from run_release_in_embassy.release_metadata import get_release_inventory_info_for_assembly, \ release_vcf_file_categories, vcf_validation_output_file_pattern, asm_report_output_file_pattern from ebi_eva_common_pyutils.command_utils import run_command_with_output -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle def remove_index_if_outdated(fasta_path): From ea48d6fe09ce2760c04eee3a61670f591aa8d189 Mon Sep 17 00:00:00 2001 From: tcezard Date: Mon, 15 Apr 2024 09:17:52 +0100 Subject: [PATCH 4/6] Separate RS from multiple lines --- .../gather_clustering_counts/bash/count_rs_for_all_files.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/eva-accession-release-automation/gather_clustering_counts/bash/count_rs_for_all_files.sh b/eva-accession-release-automation/gather_clustering_counts/bash/count_rs_for_all_files.sh index 4f8ee1b82..9ed94b2f8 100755 --- a/eva-accession-release-automation/gather_clustering_counts/bash/count_rs_for_all_files.sh +++ b/eva-accession-release-automation/gather_clustering_counts/bash/count_rs_for_all_files.sh @@ -19,7 +19,8 @@ do OUTPUT=tmp_${SC_NAME}_${ASSEMBLY}_${TYPE}.txt if [[ ${INPUT} == *.vcf.gz ]] then - zcat "${INPUT}" | grep -v '^#' | awk -v annotation="${ASSEMBLY}-${SC_NAME}-${TYPE}" '{print $3" "annotation}' > ${OUTPUT} + # There are sometime multiple rs (separated by ;) in one line that needs to be split across multiple lines + zcat "${INPUT}" | grep -v '^#' | awk -v annotation="${ASSEMBLY}-${SC_NAME}-${TYPE}" '{gsub(";","\n",$3); print $3" "annotation}' > ${OUTPUT} elif [[ 
${INPUT} == *_unmapped_ids.txt.gz ]] then SC_NAME=$(basename $(dirname ${INPUT})); From 7e515c7ab85417c5a0db2394a5c254d9f1911d95 Mon Sep 17 00:00:00 2001 From: tcezard Date: Mon, 15 Apr 2024 09:23:01 +0100 Subject: [PATCH 5/6] Apply annotation to all split RS --- .../gather_clustering_counts/bash/count_rs_for_all_files.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eva-accession-release-automation/gather_clustering_counts/bash/count_rs_for_all_files.sh b/eva-accession-release-automation/gather_clustering_counts/bash/count_rs_for_all_files.sh index 9ed94b2f8..75c7407b9 100755 --- a/eva-accession-release-automation/gather_clustering_counts/bash/count_rs_for_all_files.sh +++ b/eva-accession-release-automation/gather_clustering_counts/bash/count_rs_for_all_files.sh @@ -20,7 +20,7 @@ do if [[ ${INPUT} == *.vcf.gz ]] then # There are sometime multiple rs (separated by ;) in one line that needs to be split across multiple lines - zcat "${INPUT}" | grep -v '^#' | awk -v annotation="${ASSEMBLY}-${SC_NAME}-${TYPE}" '{gsub(";","\n",$3); print $3" "annotation}' > ${OUTPUT} + zcat "${INPUT}" | grep -v '^#' | awk '{gsub(";","\n",$3); print $3}' | awk -v annotation="${ASSEMBLY}-${SC_NAME}-${TYPE}" '{print $0" "annotation}' > ${OUTPUT} elif [[ ${INPUT} == *_unmapped_ids.txt.gz ]] then SC_NAME=$(basename $(dirname ${INPUT})); From 9b65aae7380be1b625a4a67a5ef9e1572543ee1a Mon Sep 17 00:00:00 2001 From: tcezard Date: Tue, 16 Apr 2024 13:45:16 +0100 Subject: [PATCH 6/6] Make script compatible with macos tooling --- .../gather_clustering_counts/bash/count_rs_for_all_files.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/eva-accession-release-automation/gather_clustering_counts/bash/count_rs_for_all_files.sh b/eva-accession-release-automation/gather_clustering_counts/bash/count_rs_for_all_files.sh index 75c7407b9..c846136e1 100755 --- a/eva-accession-release-automation/gather_clustering_counts/bash/count_rs_for_all_files.sh +++ 
b/eva-accession-release-automation/gather_clustering_counts/bash/count_rs_for_all_files.sh @@ -20,14 +20,14 @@ do if [[ ${INPUT} == *.vcf.gz ]] then # There are sometime multiple rs (separated by ;) in one line that needs to be split across multiple lines - zcat "${INPUT}" | grep -v '^#' | awk '{gsub(";","\n",$3); print $3}' | awk -v annotation="${ASSEMBLY}-${SC_NAME}-${TYPE}" '{print $0" "annotation}' > ${OUTPUT} + gzip -d -c "${INPUT}" | grep -v '^#' | awk '{gsub(";","\n",$3); print $3}' | awk -v annotation="${ASSEMBLY}-${SC_NAME}-${TYPE}" '{print $0" "annotation}' > ${OUTPUT} elif [[ ${INPUT} == *_unmapped_ids.txt.gz ]] then SC_NAME=$(basename $(dirname ${INPUT})); OUTPUT=tmp_${SC_NAME}_unmapped.txt - zcat "${INPUT}" | grep -v '^#' | awk -v annotation="Unmapped-${SC_NAME}-unmapped" '{print $1" "annotation}' > ${OUTPUT} + gzip -d -c "${INPUT}" | grep -v '^#' | awk -v annotation="Unmapped-${SC_NAME}-unmapped" '{print $1" "annotation}' > ${OUTPUT} else - zcat "${INPUT}" | grep -v '^#' | awk -v annotation="${ASSEMBLY}-${SC_NAME}-${TYPE}" '{print $1" "annotation}' > ${OUTPUT} + gzip -d -c "${INPUT}" | grep -v '^#' | awk -v annotation="${ASSEMBLY}-${SC_NAME}-${TYPE}" '{print $1" "annotation}' > ${OUTPUT} fi ALL_TMP_OUTPUT=$OUTPUT" "$ALL_TMP_OUTPUT done