From 9adcaedcdd541118c6da5b58cbb4cc0d3ce39006 Mon Sep 17 00:00:00 2001 From: tcezard Date: Tue, 7 May 2024 17:28:05 +0100 Subject: [PATCH 1/8] basic labels --- .../run_release_for_species.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py index 472768817..8754bce0c 100644 --- a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py +++ b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py @@ -64,7 +64,8 @@ workflow_process_template_for_nextflow = """ process {workflow-process-name} {{ - {cluster-options} + label: '{label-time}', '{label-memory}' + input: val flag from {previous-process-output-flag} output: @@ -109,8 +110,8 @@ def get_nextflow_process_definition(assembly_release_properties, workflow_proces " ".join(["--{0} {1}" .format(arg, release_properties[arg]) for arg in workflow_process_args])) - release_properties["cluster-options"] = 'clusterOptions "-g /accession"' if "run_release_for_assembly" \ - in workflow_process_name else "" + release_properties['label-time'] = 'long_time' + release_properties['label-memory'] = 'med_mem' return workflow_process_template_for_nextflow.format(**release_properties) @@ -141,9 +142,13 @@ def prepare_release_workflow_file_for_species(common_release_properties, taxonom release_assembly_properties["current-process-output-flag"] = "flag" + str(process_index) workflow_file_handle.write( - get_nextflow_process_definition(release_assembly_properties, workflow_process_name, - workflow_process_args, - process_name_suffix=assembly_accession.replace(".", "_"))) + get_nextflow_process_definition( + release_assembly_properties, + workflow_process_name, + workflow_process_args, + process_name_suffix=assembly_accession.replace(".", "_") + ) + ) workflow_file_handle.write("\n") process_index += 1 # Set the flag that will capture the output status of the current process From e9674f2ba1ca2b5e9db16a00e35bdb6a7c456427 Mon Sep 17 00:00:00 2001 From: tcezard Date: Wed, 8 May 2024 13:31:22 +0100 Subject: [PATCH 2/8] Add basic labels and fix one command --- .../copy_accessioning_collections_to_embassy.py | 2 +- .../run_release_in_embassy/run_release_for_species.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/eva-accession-release-automation/run_release_in_embassy/copy_accessioning_collections_to_embassy.py b/eva-accession-release-automation/run_release_in_embassy/copy_accessioning_collections_to_embassy.py index dc2c50c7f..7443e923e 100644 --- a/eva-accession-release-automation/run_release_in_embassy/copy_accessioning_collections_to_embassy.py +++ b/eva-accession-release-automation/run_release_in_embassy/copy_accessioning_collections_to_embassy.py @@ -52,7 +52,7 @@ def remote_db_is_empty(local_forwarded_port, assembly_accession, destination_db_ logger.info(f"check if: {assembly_accession} has been copied") command = f"mongosh --eval 'use {destination_db_name}' --eval 'db.stats()' --port {local_forwarded_port} " \ "| grep 'dataSize' | awk '{print substr($2,1,length($2)-1)}'" - output = run_command_with_output(command) + output = run_command_with_output('Check that data exists in mongodb', command, return_process_output=True) return output.strip() == '0' diff --git a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py index 8754bce0c..b4e18720c 100644 --- a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py +++ b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py @@ -64,7 +64,7 @@ workflow_process_template_for_nextflow = """ process {workflow-process-name} {{ - label: '{label-time}', '{label-memory}' + label '{label-time}', '{label-memory}' input: val flag from {previous-process-output-flag} From 4e921f249da4d95489c0f8fcb9298964071612f6 Mon Sep 17 00:00:00 2001 From: tcezard Date: Wed, 8 May 2024 14:01:37 +0100 Subject: [PATCH 3/8] add tasks specific labels --- .../run_release_for_species.py | 105 ++++++++++++------ 1 file changed, 71 insertions(+), 34 deletions(-) diff --git a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py index b4e18720c..81a377ff0 100644 --- a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py +++ b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py @@ -31,35 +31,64 @@ copy_process = "copy_accessioning_collections_to_embassy" # Processes, in order, that make up the workflow and the arguments that they take workflow_process_arguments_map = collections.OrderedDict( - [ - ("initiate_release_status_for_assembly", ["private-config-xml-file", "profile", "release-species-inventory-table", - "taxonomy-id", "assembly-accession", "release-version"]), - (copy_process, ["private-config-xml-file", "profile", "taxonomy-id", "assembly-accession", - "release-species-inventory-table", "release-version", "dump-dir"]), - ("run_release_for_assembly", ["private-config-xml-file", "profile", "taxonomy-id", - "assembly-accession", "release-species-inventory-table", - "release-version", "species-release-folder", "release-jar-path"]), - ("merge_dbsnp_eva_release_files", ["private-config-xml-file", "profile", "bgzip-path", "bcftools-path", - "vcf-sort-script-path", "taxonomy-id", "assembly-accession", - "release-species-inventory-table", "release-version", - "species-release-folder"]), - ("sort_bgzip_index_release_files", ["bgzip-path", "bcftools-path", - "vcf-sort-script-path", "taxonomy-id", "assembly-accession", - "species-release-folder"]), - ("validate_release_vcf_files", ["private-config-xml-file", "profile", "taxonomy-id", - "assembly-accession", "release-species-inventory-table", "release-version", - "species-release-folder", - "vcf-validator-path", "assembly-checker-path"]), - ("analyze_vcf_validation_results", ["species-release-folder", "assembly-accession"]), - ("count_rs_ids_in_release_files", ["count-ids-script-path", "taxonomy-id", "assembly-accession", - "species-release-folder"]), - ("validate_rs_release_files", ["private-config-xml-file", "profile", "taxonomy-id", "assembly-accession", - "release-species-inventory-table", "release-version", "species-release-folder"]), - ('update_sequence_names_to_ena', ["taxonomy-id", "assembly-accession", "species-release-folder", - 'sequence-name-converter-path', 'bcftools-path']), - ("update_release_status_for_assembly", ["private-config-xml-file", "profile", "release-species-inventory-table", - "taxonomy-id", "assembly-accession", "release-version"]) - ]) + { + "initiate_release_status_for_assembly": ( + ["private-config-xml-file", "profile", "release-species-inventory-table", "taxonomy-id", + "assembly-accession", "release-version"], + 'short_time', 'small_mem' + ), + copy_process: ( + ["private-config-xml-file", "profile", "taxonomy-id", "assembly-accession", + "release-species-inventory-table", "release-version", "dump-dir"], + 'long_time', 'small_mem' + ), + "run_release_for_assembly":( + ["private-config-xml-file", "profile", "taxonomy-id", "assembly-accession", + "release-species-inventory-table", "release-version", "species-release-folder", "release-jar-path"], + 'long_time', 'med_mem' + ), + "merge_dbsnp_eva_release_files": ( + ["private-config-xml-file", "profile", "bgzip-path", "bcftools-path", "vcf-sort-script-path", + "taxonomy-id", "assembly-accession", "release-species-inventory-table", "release-version", + "species-release-folder"], + 'long_time', 'med_mem' + ), + "sort_bgzip_index_release_files": ( + ["bgzip-path", "bcftools-path", "vcf-sort-script-path", "taxonomy-id", "assembly-accession", + "species-release-folder"], + 'long_time', 'med_mem' + ), + "validate_release_vcf_files": ( + ["private-config-xml-file", "profile", "taxonomy-id", "assembly-accession", + "release-species-inventory-table", "release-version", "species-release-folder", + "vcf-validator-path", "assembly-checker-path"], + 'long_time', 'med_mem' + ), + "analyze_vcf_validation_results": ( + ["species-release-folder", "assembly-accession"], + 'long_time', 'med_mem' + ), + "count_rs_ids_in_release_files": ( + ["count-ids-script-path", "taxonomy-id", "assembly-accession", "species-release-folder"], + 'long_time', 'med_mem' + ), + "validate_rs_release_files": ( + ["private-config-xml-file", "profile", "taxonomy-id", "assembly-accession", + "release-species-inventory-table", "release-version", "species-release-folder"], + 'long_time', 'med_mem' + ), + 'update_sequence_names_to_ena': ( + ["taxonomy-id", "assembly-accession", "species-release-folder",'sequence-name-converter-path', + 'bcftools-path'], + 'long_time', 'med_mem' + ), + "update_release_status_for_assembly": ( + ["private-config-xml-file", "profile", "release-species-inventory-table", "taxonomy-id", + "assembly-accession", "release-version"], + 'short_time', 'small_mem' + ) + } +) workflow_process_template_for_nextflow = """ process {workflow-process-name} {{ @@ -100,7 +129,7 @@ def get_release_properties_for_current_assembly(species_release_properties, asse def get_nextflow_process_definition(assembly_release_properties, workflow_process_name, workflow_process_args, - process_name_suffix=None): + process_name_suffix=None, label_time=None, label_memory=None): if process_name_suffix is None: process_name_suffix = assembly_release_properties["assembly-accession"].replace('.', '_') release_properties = copy.deepcopy(assembly_release_properties) @@ -110,8 +139,8 @@ def get_nextflow_process_definition(assembly_release_properties, workflow_proces " ".join(["--{0} {1}" .format(arg, release_properties[arg]) for arg in workflow_process_args])) - release_properties['label-time'] = 'long_time' - release_properties['label-memory'] = 'med_mem' + release_properties['label-time'] = label_time or 'long_time' + release_properties['label-memory'] = label_memory or 'med_mem' return workflow_process_template_for_nextflow.format(**release_properties) @@ -138,7 +167,13 @@ def prepare_release_workflow_file_for_species(common_release_properties, taxonom header = "#!/usr/bin/env nextflow" workflow_file_handle.write(header + "\n") - for workflow_process_name, workflow_process_args in workflow_process_arguments_map.items(): + for workflow_process_name, workflow_process_info in workflow_process_arguments_map.items(): + workflow_process_args = workflow_process_info[0] + label_time = label_memory = None + if len(workflow_process_info) > 1: + label_time = workflow_process_info[1] + if len(workflow_process_info) > 2: + label_memory = workflow_process_info[2] release_assembly_properties["current-process-output-flag"] = "flag" + str(process_index) workflow_file_handle.write( @@ -146,7 +181,9 @@ def prepare_release_workflow_file_for_species(common_release_properties, taxonom release_assembly_properties, workflow_process_name, workflow_process_args, - process_name_suffix=assembly_accession.replace(".", "_") + process_name_suffix=assembly_accession.replace(".", "_"), + label_time=label_time, + label_memory=label_memory ) ) workflow_file_handle.write("\n") From 8006d4f7a57e5920135f0c28753ce19feecbcb97 Mon Sep 17 00:00:00 2001 From: tcezard Date: Thu, 9 May 2024 09:04:52 +0100 Subject: [PATCH 4/8] Refactor to static nextflow process --- .../release_common_utils.py | 2 + .../run_release_for_assembly.nf | 175 +++++++++++ .../run_release_for_species.py | 276 +++++------------- 3 files changed, 255 insertions(+), 198 deletions(-) create mode 100644 eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf diff --git a/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py b/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py index ab6f6cade..608080afe 100644 --- a/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py +++ b/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py @@ -16,6 +16,7 @@ import os import signal import traceback +from functools import lru_cache from run_release_in_embassy.release_metadata import get_target_mongo_instance_for_assembly from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle @@ -90,5 +91,6 @@ def get_release_db_name_in_tempmongo_instance(taxonomy_id, assembly_accession): return "acc_" + str(taxonomy_id) + "_" + assembly_accession.replace('.', '_') +@lru_cache def get_release_folder_name(taxonomy_id): return taxonomy.get_normalized_scientific_name_from_ensembl(taxonomy_id) diff --git a/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf b/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf new file mode 100644 index 000000000..676aa3016 --- /dev/null +++ b/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf @@ -0,0 +1,175 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl=2 + +workflow { + initiate_release_status_for_assembly | copy_accessioning_collections_to_embassy | run_release_for_assembly | \ + merge_dbsnp_eva_release_files | sort_bgzip_index_release_files | validate_release_vcf_files | \ + analyze_vcf_validation_results | count_rs_ids_in_release_files | validate_rs_release_files | \ + update_sequence_names_to_ena | update_release_status_for_assembly +} + +process initiate_release_status_for_assembly { + + label 'short_time', 'med_mem' + + input: + val flag + output: + val true, emit: flag + script: + """ + export PYTHONPATH=$params.python_path + $params.python_script -m run_release_in_embassy.initiate_release_status_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-version $params.release_version 1>> $params.log_file 2>&1 + """ +} + +process copy_accessioning_collections_to_embassy { + + label 'long_time', 'med_mem' + + input: + val flag + output: + val true, emit: flag + script: + """ + export PYTHONPATH=$params.python_path + $params.python_script -m run_release_in_embassy.copy_accessioning_collections_to_embassy --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --dump-dir $params.dump_dir 1>> $params.log_file 2>&1 + """ +} + +process run_release_for_assembly { + + label 'long_time', 'med_mem' + + input: + val flag + output: + val true, emit: flag + script: + """ + export PYTHONPATH=$params.python_path + $params.python_script -m run_release_in_embassy.run_release_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder --release-jar-path $params.jar.release_pipeline 1>> $params.log_file 2>&1 + """ +} + +process merge_dbsnp_eva_release_files { + + label 'long_time', 'med_mem' + + input: + val flag + output: + val true, emit: flag + script: + """ + export PYTHONPATH=$params.python_path + $params.python_script -m run_release_in_embassy.merge_dbsnp_eva_release_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --bgzip-path $params.executable.bgzip --bcftools-path $params.executable.bcftools --vcf-sort-script-path $params.executable.sort_vcf_sorted_chromosomes --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 + """ +} + +process sort_bgzip_index_release_files { + + label 'long_time', 'med_mem' + + input: + val flag + output: + val true, emit: flag + script: + """ + export PYTHONPATH=$params.python_path + $params.python_script -m run_release_in_embassy.sort_bgzip_index_release_files --bgzip-path $params.executable.bgzip --bcftools-path $params.executable.bcftools --vcf-sort-script-path $params.executable.sort_vcf_sorted_chromosomes --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 + """ +} + +process validate_release_vcf_files { + + label 'long_time', 'med_mem' + + input: + val flag + output: + val true, emit: flag + script: + """ + export PYTHONPATH=$params.python_path + $params.python_script -m run_release_in_embassy.validate_release_vcf_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder --vcf-validator-path $params.executable.vcf_validator --assembly-checker-path $params.executable.vcf_assembly_checker 1>> $params.log_file 2>&1 + """ +} + +process analyze_vcf_validation_results { + + label 'long_time', 'med_mem' + + input: + val flag + output: + val true, emit: flag + script: + """ + export PYTHONPATH=$params.python_path + $params.python_script -m run_release_in_embassy.analyze_vcf_validation_results --species-release-folder $params.assembly_folder --assembly-accession $params.assembly 1>> $params.log_file 2>&1 + """ +} + +process count_rs_ids_in_release_files { + + label 'long_time', 'med_mem' + + input: + val flag + output: + val true, emit: flag + script: + """ + export PYTHONPATH=$params.python_path + $params.python_script -m run_release_in_embassy.count_rs_ids_in_release_files --count-ids-script-path $params.executable.count_ids_in_vcf --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 + """ +} + +process validate_rs_release_files { + + label 'long_time', 'med_mem' + + input: + val flag + output: + val true, emit: flag + script: + """ + export PYTHONPATH=$params.python_path + $params.python_script -m run_release_in_embassy.validate_rs_release_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 + """ +} + +process update_sequence_names_to_ena { + + label 'long_time', 'med_mem' + + input: + val flag + output: + val true, emit: flag + script: + """ + export PYTHONPATH=$params.python_path + $params.python_script -m run_release_in_embassy.update_sequence_names_to_ena --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder --sequence-name-converter-path $params.executable.convert_vcf_file --bcftools-path $params.executable.bcftools 1>> $params.log_file 2>&1 + """ +} + +process update_release_status_for_assembly { + + label 'short_time', 'med_mem' + + input: + val flag + output: + val true, emit: flag11 + script: + """ + export PYTHONPATH=$params.python_path + $params.python_script -m run_release_in_embassy.update_release_status_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-version $params.release_version 1>> $params.log_file 2>&1 + """ +} diff --git a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py index 81a377ff0..ece259e4f 100644 --- a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py +++ b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py @@ -13,218 +13,92 @@ # limitations under the License. from argparse import ArgumentParser -import collections -import copy -import datetime -import logging import os -import yaml -from ebi_eva_common_pyutils.common_utils import merge_two_dicts +import yaml +from ebi_eva_common_pyutils.command_utils import run_command_with_output +from ebi_eva_common_pyutils.logger import logging_config from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle +from ebi_eva_common_pyutils.config import cfg from run_release_in_embassy.release_metadata import get_release_assemblies_for_taxonomy from run_release_in_embassy.release_common_utils import get_release_folder_name -logger = logging.getLogger(__name__) -timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") -copy_process = "copy_accessioning_collections_to_embassy" -# Processes, in order, that make up the workflow and the arguments that they take -workflow_process_arguments_map = collections.OrderedDict( - { - "initiate_release_status_for_assembly": ( - ["private-config-xml-file", "profile", "release-species-inventory-table", "taxonomy-id", - "assembly-accession", "release-version"], - 'short_time', 'small_mem' - ), - copy_process: ( - ["private-config-xml-file", "profile", "taxonomy-id", "assembly-accession", - "release-species-inventory-table", "release-version", "dump-dir"], - 'long_time', 'small_mem' - ), - "run_release_for_assembly":( - ["private-config-xml-file", "profile", "taxonomy-id", "assembly-accession", - "release-species-inventory-table", "release-version", "species-release-folder", "release-jar-path"], - 'long_time', 'med_mem' - ), - "merge_dbsnp_eva_release_files": ( - ["private-config-xml-file", "profile", "bgzip-path", "bcftools-path", "vcf-sort-script-path", - "taxonomy-id", "assembly-accession", "release-species-inventory-table", "release-version", - "species-release-folder"], - 'long_time', 'med_mem' - ), - "sort_bgzip_index_release_files": ( - ["bgzip-path", "bcftools-path", "vcf-sort-script-path", "taxonomy-id", "assembly-accession", - "species-release-folder"], - 'long_time', 'med_mem' - ), - "validate_release_vcf_files": ( - ["private-config-xml-file", "profile", "taxonomy-id", "assembly-accession", - "release-species-inventory-table", "release-version", "species-release-folder", - "vcf-validator-path", "assembly-checker-path"], - 'long_time', 'med_mem' - ), - "analyze_vcf_validation_results": ( - ["species-release-folder", "assembly-accession"], - 'long_time', 'med_mem' - ), - "count_rs_ids_in_release_files": ( - ["count-ids-script-path", "taxonomy-id", "assembly-accession", "species-release-folder"], - 'long_time', 'med_mem' - ), - "validate_rs_release_files": ( - ["private-config-xml-file", "profile", "taxonomy-id", "assembly-accession", - "release-species-inventory-table", "release-version", "species-release-folder"], - 'long_time', 'med_mem' - ), - 'update_sequence_names_to_ena': ( - ["taxonomy-id", "assembly-accession", "species-release-folder",'sequence-name-converter-path', - 'bcftools-path'], - 'long_time', 'med_mem' - ), - "update_release_status_for_assembly": ( - ["private-config-xml-file", "profile", "release-species-inventory-table", "taxonomy-id", - "assembly-accession", "release-version"], - 'short_time', 'small_mem' - ) - } -) - -workflow_process_template_for_nextflow = """ -process {workflow-process-name} {{ - - label '{label-time}', '{label-memory}' - - input: - val flag from {previous-process-output-flag} - output: - val true into {current-process-output-flag} - script: - \"\"\" - export PYTHONPATH={script-path} && ({python3-path} -m run_release_in_embassy.{process-with-args} 1>> {release-log-file} 2>&1) - \"\"\" -}} -""" - - -def get_release_properties_for_current_species(common_release_properties, taxonomy_id): - release_properties = {"taxonomy-id": taxonomy_id, - "species-release-folder": os.path.join(common_release_properties["release-folder"], - get_release_folder_name(taxonomy_id))} - os.makedirs(release_properties["species-release-folder"], exist_ok=True) - release_properties["timestamp"] = timestamp - release_properties["dump-dir"] = os.path.join(release_properties["species-release-folder"], "dumps") - os.makedirs(release_properties["dump-dir"], exist_ok=True) - return release_properties - - -def get_release_properties_for_current_assembly(species_release_properties, assembly_accession): - release_properties = { - "assembly-accession": assembly_accession, - "assembly-release-folder": os.path.join(species_release_properties["species-release-folder"], assembly_accession), - "release-log-file": "{species-release-folder}/{assembly_accession}/release_{taxonomy-id}_{assembly_accession}_{timestamp}.log".format(**species_release_properties, assembly_accession=assembly_accession) +logger = logging_config.getLogger(__name__) + + +def get_nextflow_config_param(taxonomy_id, assembly_accession, release_version): + dump_dir = os.path.join(get_species_release_folder(taxonomy_id), 'dumps') + release_dir = get_release_log_file_name(taxonomy_id, assembly_accession) + config_param = os.path.join(release_dir, f'nextflow_params_{taxonomy_id}_{assembly_accession}.yaml') + os.makedirs(dump_dir, exist_ok=True) + yaml_data = { + 'assembly': assembly_accession, + 'dump_dir': dump_dir, + 'executable': cfg['executable'], + # 'executable.bgzip': + # 'executable.convert_vcf_file': + # 'executable.count_ids_in_vcf': + # 'executable.sort_vcf_sorted_chromosomes': + # 'executable.vcf_assembly_checker': + # 'executable.vcf_validator': + 'jar': cfg['jar'], + 'log_file': get_release_log_file_name(taxonomy_id, assembly_accession), + 'maven': cfg['maven'], + 'python_script': cfg.query('python', 'interpreter'), + 'python_path': os.environ['PYTHONPATH'], + 'release_version': release_version, + 'assembly_folder': release_dir, + 'taxonomy': taxonomy_id } - os.makedirs(release_properties["assembly-release-folder"], exist_ok=True) - return release_properties - - -def get_nextflow_process_definition(assembly_release_properties, workflow_process_name, workflow_process_args, - process_name_suffix=None, label_time=None, label_memory=None): - if process_name_suffix is None: - process_name_suffix = assembly_release_properties["assembly-accession"].replace('.', '_') - release_properties = copy.deepcopy(assembly_release_properties) - release_properties["workflow-process-name"] = workflow_process_name + "_" + process_name_suffix - - release_properties["process-with-args"] = "{0} {1}".format(workflow_process_name, - " ".join(["--{0} {1}" - .format(arg, release_properties[arg]) - for arg in workflow_process_args])) - release_properties['label-time'] = label_time or 'long_time' - release_properties['label-memory'] = label_memory or 'med_mem' - return workflow_process_template_for_nextflow.format(**release_properties) - - -def prepare_release_workflow_file_for_species(common_release_properties, taxonomy_id, assembly_accession): - process_index = 1 - release_properties = merge_two_dicts(common_release_properties, - get_release_properties_for_current_species(common_release_properties, - taxonomy_id)) - release_assembly_properties = merge_two_dicts(release_properties, - get_release_properties_for_current_assembly(release_properties, - assembly_accession)) - # This hack is needed to kick-off the initial process in Nextflow - release_assembly_properties["previous-process-output-flag"] = "true" - release_assembly_properties["current-process-output-flag"] = "flag" + str(process_index) - # Ensure that PYTHONPATH is properly set so that scripts can be run - # as "python3 -m run_release_in_embassy." - release_assembly_properties["script-path"] = os.environ["PYTHONPATH"] - workflow_file_name = os.path.join( - release_assembly_properties["species-release-folder"], release_assembly_properties["assembly-accession"], - "{taxonomy-id}_{assembly-accession}_release_workflow_{timestamp}.nf".format(**release_assembly_properties) - ) - - with open(workflow_file_name, "w") as workflow_file_handle: - header = "#!/usr/bin/env nextflow" - workflow_file_handle.write(header + "\n") - - for workflow_process_name, workflow_process_info in workflow_process_arguments_map.items(): - workflow_process_args = workflow_process_info[0] - label_time = label_memory = None - if len(workflow_process_info) > 1: - label_time = workflow_process_info[1] - if len(workflow_process_info) > 2: - label_memory = workflow_process_info[2] - release_assembly_properties["current-process-output-flag"] = "flag" + str(process_index) - - workflow_file_handle.write( - get_nextflow_process_definition( - release_assembly_properties, - workflow_process_name, - workflow_process_args, - process_name_suffix=assembly_accession.replace(".", "_"), - label_time=label_time, - label_memory=label_memory - ) - ) - workflow_file_handle.write("\n") - process_index += 1 - # Set the flag that will capture the output status of the current process - # This variable will be used to decide whether the next process should be started - # See http://nextflow-io.github.io/patterns/index.html#_mock_dependency - release_assembly_properties["previous-process-output-flag"] = "flag" + str(process_index - 1) - return workflow_file_name, release_assembly_properties["release-log-file"] + with open(config_param, 'w') as open_file: + yaml.safe_dump(yaml_data, open_file) + return config_param + + +def get_run_release_for_assembly_nextflow(): + curr_dir = os.path.dirname(__file__) + return os.path.join(curr_dir, 'run_release_for_assembly.nf') + + +def get_release_log_file_name(taxonomy_id, assembly_accession): + return f"{cfg['species-release-folder']}/{assembly_accession}/release_{taxonomy_id}_{assembly_accession}.log" + +def get_species_release_folder(taxonomy_id): + return os.path.join(cfg["release-folder"], get_release_folder_name(taxonomy_id)) -def get_common_release_properties(common_release_properties_file): - return yaml.load(open(common_release_properties_file), Loader=yaml.FullLoader) +def get_assembly_release_folder(taxonomy_id, assembly_accession): + return os.path.join(get_species_release_folder(taxonomy_id), assembly_accession) -def run_release_for_species(common_release_properties_file, taxonomy_id, release_assemblies): - common_release_properties = get_common_release_properties(common_release_properties_file) - private_config_xml_file = common_release_properties["private-config-xml-file"] - profile = common_release_properties["profile"] - release_species_inventory_table = common_release_properties["release-species-inventory-table"] - release_version = common_release_properties["release-version"] + +def run_release_for_species(taxonomy_id, release_assemblies, release_version): + private_config_xml_file = cfg.query("maven", "settings_file") + profile = cfg.query("maven", "environment") + release_species_inventory_table = cfg["release-species-inventory-table"] with get_metadata_connection_handle(profile, private_config_xml_file) as metadata_connection_handle: if not release_assemblies: - release_assemblies = get_release_assemblies_for_taxonomy(taxonomy_id, release_species_inventory_table, - release_version, metadata_connection_handle) + release_assemblies = get_release_assemblies_for_taxonomy( + taxonomy_id, release_species_inventory_table, release_version, metadata_connection_handle + ) + for assembly_accession in release_assemblies: - workflow_file_name, release_log_file = prepare_release_workflow_file_for_species(common_release_properties, - taxonomy_id, - assembly_accession) - workflow_report_file_name = workflow_file_name.replace(".nf", ".report.html") - if os.path.exists(workflow_report_file_name): - os.remove(workflow_report_file_name) - workflow_command = "cd {0} && {1} run {2} -c {3} -with-report {4} -bg".format( - os.path.dirname(release_log_file), - common_release_properties["nextflow-binary-path"], workflow_file_name, - common_release_properties["nextflow-config-path"], workflow_report_file_name) - logger.info("Check log file in: " + release_log_file + " to monitor progress...") - logger.info("Running workflow file {0} with the following command:\n\n {1} \n\n" - "Use the above command with -resume if this workflow needs to be resumed in the future" - .format(workflow_file_name, workflow_command)) - os.system(workflow_command) + nextflow_params = get_nextflow_config_param(taxonomy_id, assembly_accession, release_version) + workflow_file_path = get_run_release_for_assembly_nextflow() + release_dir = get_assembly_release_folder(taxonomy_id, assembly_accession) + workflow_command = ( + f"cd {release_dir} && " + f"{cfg.query('executable', 'nextflow')} run {workflow_file_path} " + f"-params-file {nextflow_params}" + ) + logger.info(f"Running workflow file {workflow_file_path} with the following command: " + f"\n {workflow_command} \n") + run_command_with_output(f"Running workflow file {workflow_file_path}", workflow_command) + + +def load_config(config_file): + cfg.load_config_file(config_file) def main(): @@ -232,8 +106,14 @@ def main(): argparse.add_argument("--common-release-properties-file", help="ex: /path/to/release/properties.yml", required=True) argparse.add_argument("--taxonomy-id", help="ex: 9913", required=True) argparse.add_argument("--assembly-accessions", nargs='+', help="ex: GCA_000003055.3") + argparse.add_argument("--release_version", required=True) args = argparse.parse_args() - run_release_for_species(args.common_release_properties_file, args.taxonomy_id, args.assembly_accessions) + + logging_config.add_stdout_handler() + + load_config(args.common_release_properties_file) + + run_release_for_species(args.taxonomy_id, args.assembly_accessions, args.release_version) if __name__ == "__main__": From cca7c1dd5eb0f0eeaca6293b65f1f86fe61e1951 Mon Sep 17 00:00:00 2001 From: tcezard Date: Thu, 9 May 2024 11:01:28 +0100 Subject: [PATCH 5/8] Allow a configs to be passed from a variables or command line --- .../run_release_for_species.py | 45 ++++++++++++------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py index ece259e4f..8ca5a30f9 100644 --- a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py +++ b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py @@ -27,7 +27,7 @@ logger = logging_config.getLogger(__name__) -def get_nextflow_config_param(taxonomy_id, assembly_accession, release_version): +def get_nextflow_params(taxonomy_id, assembly_accession, release_version): dump_dir = os.path.join(get_species_release_folder(taxonomy_id), 'dumps') release_dir = get_release_log_file_name(taxonomy_id, assembly_accession) config_param = os.path.join(release_dir, f'nextflow_params_{taxonomy_id}_{assembly_accession}.yaml') @@ -36,12 +36,6 @@ def get_nextflow_config_param(taxonomy_id, assembly_accession, release_version): 'assembly': assembly_accession, 'dump_dir': dump_dir, 'executable': cfg['executable'], - # 'executable.bgzip': - # 'executable.convert_vcf_file': - # 'executable.count_ids_in_vcf': - # 'executable.sort_vcf_sorted_chromosomes': - # 'executable.vcf_assembly_checker': - # 'executable.vcf_validator': 'jar': cfg['jar'], 'log_file': get_release_log_file_name(taxonomy_id, assembly_accession), 'maven': cfg['maven'], @@ -56,6 +50,11 @@ def get_nextflow_config_param(taxonomy_id, assembly_accession, release_version): return config_param +def get_nextflow_config(): + if 'RELEASE_NEXTFLOW_CONFIG' in os.environ and os.path.isfile(os.environ['RELEASE_NEXTFLOW_CONFIG']): + return os.environ['RELEASE_NEXTFLOW_CONFIG'] + + def get_run_release_for_assembly_nextflow(): curr_dir = os.path.dirname(__file__) return os.path.join(curr_dir, 'run_release_for_assembly.nf') @@ -73,10 +72,10 @@ def get_assembly_release_folder(taxonomy_id, assembly_accession): return os.path.join(get_species_release_folder(taxonomy_id), assembly_accession) -def run_release_for_species(taxonomy_id, release_assemblies, release_version): +def run_release_for_species(taxonomy_id, release_assemblies, release_version, resume=False): private_config_xml_file = cfg.query("maven", "settings_file") profile = cfg.query("maven", "environment") - release_species_inventory_table = cfg["release-species-inventory-table"] + release_species_inventory_table = cfg.query('release', 'inventory_table') with get_metadata_connection_handle(profile, private_config_xml_file) as metadata_connection_handle: if not release_assemblies: release_assemblies = get_release_assemblies_for_taxonomy( @@ -84,36 +83,48 @@ def run_release_for_species(taxonomy_id, release_assemblies, release_version): ) for assembly_accession in release_assemblies: - nextflow_params = get_nextflow_config_param(taxonomy_id, assembly_accession, release_version) + nextflow_params = get_nextflow_params(taxonomy_id, assembly_accession, release_version) workflow_file_path = get_run_release_for_assembly_nextflow() release_dir = get_assembly_release_folder(taxonomy_id, assembly_accession) - workflow_command = ( + nextflow_config = get_nextflow_config() + workflow_command = ' '.join(( f"cd {release_dir} && " f"{cfg.query('executable', 'nextflow')} run {workflow_file_path} " - f"-params-file {nextflow_params}" - ) + f"-params-file {nextflow_params} " + f'-c {nextflow_config}' if nextflow_config else '' + '-resume' if resume else '' + )) logger.info(f"Running workflow file {workflow_file_path} with the following command: " f"\n {workflow_command} \n") run_command_with_output(f"Running workflow file {workflow_file_path}", workflow_command) -def load_config(config_file): - cfg.load_config_file(config_file) +def load_config(*args): + cfg.load_config_file( + *args, + os.environ.get('RELEASE_CONFIG'), + '~/.release_config.yml' + ) def main(): argparse = ArgumentParser() - argparse.add_argument("--common-release-properties-file", help="ex: /path/to/release/properties.yml", required=True) argparse.add_argument("--taxonomy-id", help="ex: 9913", required=True) argparse.add_argument("--assembly-accessions", nargs='+', help="ex: GCA_000003055.3") argparse.add_argument("--release_version", required=True) + argparse.add_argument("--resume", default=False, required=False, + help="Resume the nextflow pipeline for the specified taxonomy and assembly") + argparse.add_argument("--release_config-properties-file", + help="Path to the release configuration file. That will override the config specified with " + "RELEASE_CONFIG variable or placed in ~/.release_config.yml.", + required=False) args = argparse.parse_args() logging_config.add_stdout_handler() load_config(args.common_release_properties_file) - run_release_for_species(args.taxonomy_id, args.assembly_accessions, args.release_version) + run_release_for_species(args.taxonomy_id, args.assembly_accessions, args.release_version, args.resume) if __name__ == "__main__": From 67e91dbe7169c78557da5d4cc44dbcec58ee5524 Mon Sep 17 00:00:00 2001 From: tcezard Date: Thu, 9 May 2024 11:05:30 +0100 Subject: [PATCH 6/8] Change property name of release output folder --- .../run_release_in_embassy/run_release_for_species.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py index 8ca5a30f9..f3c41a8c8 100644 --- a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py +++ b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py @@ -65,7 +65,7 @@ def get_release_log_file_name(taxonomy_id, assembly_accession): def get_species_release_folder(taxonomy_id): - return os.path.join(cfg["release-folder"], get_release_folder_name(taxonomy_id)) + return os.path.join(cfg["release_output"], get_release_folder_name(taxonomy_id)) def get_assembly_release_folder(taxonomy_id, assembly_accession): From d556d086e4c05952344f08cff2da53ea4c376770 Mon Sep 17 00:00:00 2001 From: tcezard Date: Thu, 9 May 2024 12:09:13 +0100 Subject: [PATCH 7/8] Couple of bug fixes and sync with config file --- .../run_release_in_embassy/__init__.py | 25 ------------- .../release_common_utils.py | 5 ++- .../run_release_for_assembly.nf | 24 ++++++------ .../run_release_for_assembly.py | 5 ++- .../run_release_for_species.py | 37 +++++++++++-------- 5 files changed, 40 insertions(+), 56 deletions(-) diff --git a/eva-accession-release-automation/run_release_in_embassy/__init__.py b/eva-accession-release-automation/run_release_in_embassy/__init__.py index 00e90d898..e69de29bb 100644 --- a/eva-accession-release-automation/run_release_in_embassy/__init__.py +++ b/eva-accession-release-automation/run_release_in_embassy/__init__.py @@ -1,25 +0,0 @@ -# Copyright 2020 EMBL - European Bioinformatics Institute -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import sys - - -def init_logger(): - logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)-15s %(levelname)s %(message)s') - result_logger = logging.getLogger(__name__) - return result_logger - - -logger = init_logger() diff --git a/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py b/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py index 608080afe..58829a864 100644 --- a/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py +++ b/eva-accession-release-automation/run_release_in_embassy/release_common_utils.py @@ -12,18 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging import os import signal import traceback from functools import lru_cache +from ebi_eva_common_pyutils.logger import logging_config + from run_release_in_embassy.release_metadata import get_target_mongo_instance_for_assembly from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle from ebi_eva_common_pyutils.network_utils import get_available_local_port, forward_remote_port_to_local_port from ebi_eva_common_pyutils.taxonomy import taxonomy -logger = logging.getLogger(__name__) +logger = logging_config.get_logger(__name__) def open_mongo_port_to_tempmongo(private_config_xml_file, profile, taxonomy_id, assembly, diff --git a/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf b/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf index 676aa3016..12a524d7a 100644 --- a/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf +++ b/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf @@ -3,7 +3,7 @@ nextflow.enable.dsl=2 workflow { - initiate_release_status_for_assembly | copy_accessioning_collections_to_embassy | run_release_for_assembly | \ + initiate_release_status_for_assembly('initiate') | copy_accessioning_collections_to_embassy | run_release_for_assembly | \ merge_dbsnp_eva_release_files | sort_bgzip_index_release_files | validate_release_vcf_files | \ analyze_vcf_validation_results | count_rs_ids_in_release_files | validate_rs_release_files | \ update_sequence_names_to_ena | update_release_status_for_assembly @@ -20,7 +20,7 @@ process initiate_release_status_for_assembly { script: """ export PYTHONPATH=$params.python_path - $params.python_script -m run_release_in_embassy.initiate_release_status_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-version $params.release_version 1>> $params.log_file 2>&1 + $params.executable.python.interpreter -m run_release_in_embassy.initiate_release_status_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-version $params.release_version 1>> $params.log_file 2>&1 """ } @@ -35,7 +35,7 @@ process copy_accessioning_collections_to_embassy { script: """ export PYTHONPATH=$params.python_path - $params.python_script -m run_release_in_embassy.copy_accessioning_collections_to_embassy --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --dump-dir $params.dump_dir 1>> $params.log_file 2>&1 + $params.executable.python.interpreter -m run_release_in_embassy.copy_accessioning_collections_to_embassy --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --dump-dir $params.dump_dir 1>> $params.log_file 2>&1 """ } @@ -50,7 +50,7 @@ process run_release_for_assembly { script: """ export PYTHONPATH=$params.python_path - $params.python_script -m run_release_in_embassy.run_release_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder --release-jar-path $params.jar.release_pipeline 1>> $params.log_file 2>&1 + $params.executable.python.interpreter -m run_release_in_embassy.run_release_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder --release-jar-path $params.jar.release_pipeline 1>> $params.log_file 2>&1 """ } @@ -65,7 +65,7 @@ process merge_dbsnp_eva_release_files { script: """ export PYTHONPATH=$params.python_path - $params.python_script -m run_release_in_embassy.merge_dbsnp_eva_release_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --bgzip-path $params.executable.bgzip --bcftools-path $params.executable.bcftools --vcf-sort-script-path $params.executable.sort_vcf_sorted_chromosomes --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 + $params.executable.python.interpreter -m run_release_in_embassy.merge_dbsnp_eva_release_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --bgzip-path $params.executable.bgzip --bcftools-path $params.executable.bcftools --vcf-sort-script-path $params.executable.sort_vcf_sorted_chromosomes --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 """ } @@ -80,7 +80,7 @@ process sort_bgzip_index_release_files { script: """ export PYTHONPATH=$params.python_path - $params.python_script -m run_release_in_embassy.sort_bgzip_index_release_files --bgzip-path $params.executable.bgzip --bcftools-path $params.executable.bcftools --vcf-sort-script-path $params.executable.sort_vcf_sorted_chromosomes --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 + $params.executable.python.interpreter -m run_release_in_embassy.sort_bgzip_index_release_files --bgzip-path $params.executable.bgzip --bcftools-path $params.executable.bcftools --vcf-sort-script-path $params.executable.sort_vcf_sorted_chromosomes --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 """ } @@ -95,7 +95,7 @@ process validate_release_vcf_files { script: """ export PYTHONPATH=$params.python_path - $params.python_script -m run_release_in_embassy.validate_release_vcf_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder --vcf-validator-path $params.executable.vcf_validator --assembly-checker-path $params.executable.vcf_assembly_checker 1>> $params.log_file 2>&1 + $params.executable.python.interpreter -m run_release_in_embassy.validate_release_vcf_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder --vcf-validator-path $params.executable.vcf_validator --assembly-checker-path $params.executable.vcf_assembly_checker 1>> $params.log_file 2>&1 """ } @@ -110,7 +110,7 @@ process analyze_vcf_validation_results { script: """ export PYTHONPATH=$params.python_path - $params.python_script -m run_release_in_embassy.analyze_vcf_validation_results --species-release-folder $params.assembly_folder --assembly-accession $params.assembly 1>> $params.log_file 2>&1 + $params.executable.python.interpreter -m run_release_in_embassy.analyze_vcf_validation_results --species-release-folder $params.assembly_folder --assembly-accession $params.assembly 1>> $params.log_file 2>&1 """ } @@ -125,7 +125,7 @@ process count_rs_ids_in_release_files { script: """ export PYTHONPATH=$params.python_path - $params.python_script -m run_release_in_embassy.count_rs_ids_in_release_files --count-ids-script-path $params.executable.count_ids_in_vcf --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 + $params.executable.python.interpreter -m run_release_in_embassy.count_rs_ids_in_release_files --count-ids-script-path $params.executable.count_ids_in_vcf --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 """ } @@ -140,7 +140,7 @@ process validate_rs_release_files { script: """ export PYTHONPATH=$params.python_path - $params.python_script -m run_release_in_embassy.validate_rs_release_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 + $params.executable.python.interpreter -m run_release_in_embassy.validate_rs_release_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1 """ } @@ -155,7 +155,7 @@ process update_sequence_names_to_ena { script: """ export PYTHONPATH=$params.python_path - $params.python_script -m run_release_in_embassy.update_sequence_names_to_ena --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder --sequence-name-converter-path $params.executable.convert_vcf_file --bcftools-path $params.executable.bcftools 1>> $params.log_file 2>&1 + $params.executable.python.interpreter -m run_release_in_embassy.update_sequence_names_to_ena --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder --sequence-name-converter-path $params.executable.convert_vcf_file --bcftools-path $params.executable.bcftools 1>> $params.log_file 2>&1 """ } @@ -170,6 +170,6 @@ process update_release_status_for_assembly { script: """ export PYTHONPATH=$params.python_path - $params.python_script -m run_release_in_embassy.update_release_status_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-version $params.release_version 1>> $params.log_file 2>&1 + $params.executable.python.interpreter -m run_release_in_embassy.update_release_status_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-version $params.release_version 1>> $params.log_file 2>&1 """ } diff --git a/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.py b/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.py index 630f5ec92..f6c6b326c 100644 --- a/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.py +++ b/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.py @@ -13,16 +13,17 @@ # limitations under the License. import click -import logging import sys import traceback +from ebi_eva_common_pyutils.logger import logging_config + from run_release_in_embassy.create_release_properties_file import create_release_properties_file_for_assembly from run_release_in_embassy.release_common_utils import open_mongo_port_to_tempmongo, close_mongo_port_to_tempmongo from ebi_eva_common_pyutils.command_utils import run_command_with_output -logger = logging.getLogger(__name__) +logger = logging_config.get_logger(__name__) def run_release_for_assembly(private_config_xml_file, profile, taxonomy_id, assembly_accession, diff --git a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py index f3c41a8c8..531177120 100644 --- a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py +++ b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py @@ -14,6 +14,7 @@ from argparse import ArgumentParser import os +from functools import lru_cache import yaml from ebi_eva_common_pyutils.command_utils import run_command_with_output @@ -24,12 +25,12 @@ from run_release_in_embassy.release_common_utils import get_release_folder_name -logger = logging_config.getLogger(__name__) +logger = logging_config.get_logger(__name__) def get_nextflow_params(taxonomy_id, assembly_accession, release_version): dump_dir = os.path.join(get_species_release_folder(taxonomy_id), 'dumps') - release_dir = get_release_log_file_name(taxonomy_id, assembly_accession) + release_dir = get_assembly_release_folder(taxonomy_id, assembly_accession) config_param = os.path.join(release_dir, f'nextflow_params_{taxonomy_id}_{assembly_accession}.yaml') os.makedirs(dump_dir, exist_ok=True) yaml_data = { @@ -52,7 +53,7 @@ def get_nextflow_params(taxonomy_id, assembly_accession, release_version): def get_nextflow_config(): if 'RELEASE_NEXTFLOW_CONFIG' in os.environ and os.path.isfile(os.environ['RELEASE_NEXTFLOW_CONFIG']): - return os.environ['RELEASE_NEXTFLOW_CONFIG'] + return os.path.abspath(os.environ['RELEASE_NEXTFLOW_CONFIG']) def get_run_release_for_assembly_nextflow(): @@ -61,15 +62,21 @@ def get_run_release_for_assembly_nextflow(): def get_release_log_file_name(taxonomy_id, assembly_accession): - return f"{cfg['species-release-folder']}/{assembly_accession}/release_{taxonomy_id}_{assembly_accession}.log" + return f"{get_assembly_release_folder(taxonomy_id, assembly_accession)}/release_{taxonomy_id}_{assembly_accession}.log" +@lru_cache def get_species_release_folder(taxonomy_id): - return os.path.join(cfg["release_output"], get_release_folder_name(taxonomy_id)) + folder = os.path.join(cfg.query('release', 'release_output'), get_release_folder_name(taxonomy_id)) + os.makedirs(folder, exist_ok=True) + return folder +@lru_cache def get_assembly_release_folder(taxonomy_id, assembly_accession): - return os.path.join(get_species_release_folder(taxonomy_id), assembly_accession) + folder = os.path.join(get_species_release_folder(taxonomy_id), assembly_accession) + os.makedirs(folder, exist_ok=True) + return folder def run_release_for_species(taxonomy_id, release_assemblies, release_version, resume=False): @@ -88,11 +95,11 @@ def run_release_for_species(taxonomy_id, release_assemblies, release_version, re release_dir = get_assembly_release_folder(taxonomy_id, assembly_accession) nextflow_config = get_nextflow_config() workflow_command = ' '.join(( - f"cd {release_dir} && " - f"{cfg.query('executable', 'nextflow')} run {workflow_file_path} " - f"-params-file {nextflow_params} " - f'-c {nextflow_config}' if nextflow_config else '' - '-resume' if resume else '' + f"cd {release_dir} &&", + f"{cfg.query('executable', 'nextflow')} run {workflow_file_path}", + f"-params-file {nextflow_params}", + f'-c {nextflow_config}' if nextflow_config else '', + '-resume' if resume else '', )) logger.info(f"Running workflow file {workflow_file_path} with the following command: " f"\n {workflow_command} \n") @@ -109,12 +116,12 @@ def load_config(*args): def main(): argparse = ArgumentParser() - argparse.add_argument("--taxonomy-id", help="ex: 9913", required=True) - argparse.add_argument("--assembly-accessions", nargs='+', help="ex: GCA_000003055.3") + argparse.add_argument("--taxonomy_id", help="ex: 9913", required=True) + argparse.add_argument("--assembly_accessions", nargs='+', help="ex: GCA_000003055.3") argparse.add_argument("--release_version", required=True) argparse.add_argument("--resume", default=False, required=False, help="Resume the nextflow pipeline for the specified taxonomy and assembly") - argparse.add_argument("--release_config-properties-file", + argparse.add_argument("--release_config_file", help="Path to the release configuration file. That will override the config specified with " "RELEASE_CONFIG variable or placed in ~/.release_config.yml.", required=False) @@ -122,7 +129,7 @@ def main(): logging_config.add_stdout_handler() - load_config(args.common_release_properties_file) + load_config(args.release_config_file) run_release_for_species(args.taxonomy_id, args.assembly_accessions, args.release_version, args.resume) From 14557c8f3ce5d7ec38a968e72a53f36cf31fe129 Mon Sep 17 00:00:00 2001 From: tcezard Date: Fri, 10 May 2024 16:13:02 +0100 Subject: [PATCH 8/8] address review comments --- eva-accession-release-automation/README.md | 1 - .../run_release_for_assembly.nf | 66 ++++++++++++------- .../run_release_for_species.py | 9 +-- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/eva-accession-release-automation/README.md b/eva-accession-release-automation/README.md index 1c2dedd8a..c3cd0b7a5 100644 --- a/eva-accession-release-automation/README.md +++ b/eva-accession-release-automation/README.md @@ -9,7 +9,6 @@ pip install -e /path/to/eva-accession/eva-accession-release-automation The release automation script `run_release_for_species.py` has the following parameters: * **common-release-properties-file:** Path to yaml config file, see below * **taxonomy-id:** Taxonomy to release -* **memory:** Amount of memory to use when running the release jobs You also need to set `PYTHONPATH=/path/to/eva-accession/eva-accession-release-automation/run_release_in_embassy`. diff --git a/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf b/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf index 12a524d7a..c294f025b 100644 --- a/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf +++ b/eva-accession-release-automation/run_release_in_embassy/run_release_for_assembly.nf @@ -14,9 +14,11 @@ process initiate_release_status_for_assembly { label 'short_time', 'med_mem' input: - val flag + val flag + output: - val true, emit: flag + val true, emit: flag + script: """ export PYTHONPATH=$params.python_path @@ -29,9 +31,11 @@ process copy_accessioning_collections_to_embassy { label 'long_time', 'med_mem' input: - val flag + val flag + output: - val true, emit: flag + val true, emit: flag + script: """ export PYTHONPATH=$params.python_path @@ -44,9 +48,11 @@ process run_release_for_assembly { label 'long_time', 'med_mem' input: - val flag + val flag + output: - val true, emit: flag + val true, emit: flag + script: """ export PYTHONPATH=$params.python_path @@ -59,9 +65,11 @@ process merge_dbsnp_eva_release_files { label 'long_time', 'med_mem' input: - val flag + val flag + output: - val true, emit: flag + val true, emit: flag + script: """ export PYTHONPATH=$params.python_path @@ -74,9 +82,11 @@ process sort_bgzip_index_release_files { label 'long_time', 'med_mem' input: - val flag + val flag + output: - val true, emit: flag + val true, emit: flag + script: """ export PYTHONPATH=$params.python_path @@ -89,9 +99,11 @@ process validate_release_vcf_files { label 'long_time', 'med_mem' input: - val flag + val flag + output: - val true, emit: flag + val true, emit: flag + script: """ export PYTHONPATH=$params.python_path @@ -104,9 +116,11 @@ process analyze_vcf_validation_results { label 'long_time', 'med_mem' input: - val flag + val flag + output: - val true, emit: flag + val true, emit: flag + script: """ export PYTHONPATH=$params.python_path @@ -119,9 +133,11 @@ process count_rs_ids_in_release_files { label 'long_time', 'med_mem' input: - val flag + val flag + output: - val true, emit: flag + val true, emit: flag + script: """ export PYTHONPATH=$params.python_path @@ -134,9 +150,11 @@ process validate_rs_release_files { label 'long_time', 'med_mem' input: - val flag + val flag + output: - val true, emit: flag + val true, emit: flag + script: """ export PYTHONPATH=$params.python_path @@ -149,9 +167,11 @@ process update_sequence_names_to_ena { label 'long_time', 'med_mem' input: - val flag + val flag + output: - val true, emit: flag + val true, emit: flag + script: """ export PYTHONPATH=$params.python_path @@ -164,9 +184,11 @@ process update_release_status_for_assembly { label 'short_time', 'med_mem' input: - val flag + val flag + output: - val true, emit: flag11 + val true, emit: flag11 + script: """ export PYTHONPATH=$params.python_path diff --git a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py index 531177120..51dc63de7 100644 --- a/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py +++ b/eva-accession-release-automation/run_release_in_embassy/run_release_for_species.py @@ -40,7 +40,6 @@ def get_nextflow_params(taxonomy_id, assembly_accession, release_version): 'jar': cfg['jar'], 'log_file': get_release_log_file_name(taxonomy_id, assembly_accession), 'maven': cfg['maven'], - 'python_script': cfg.query('python', 'interpreter'), 'python_path': os.environ['PYTHONPATH'], 'release_version': release_version, 'assembly_folder': release_dir, @@ -121,16 +120,10 @@ def main(): argparse.add_argument("--release_version", required=True) argparse.add_argument("--resume", default=False, required=False, help="Resume the nextflow pipeline for the specified taxonomy and assembly") - argparse.add_argument("--release_config_file", - help="Path to the release configuration file. That will override the config specified with " - "RELEASE_CONFIG variable or placed in ~/.release_config.yml.", - required=False) args = argparse.parse_args() - + load_config() logging_config.add_stdout_handler() - load_config(args.release_config_file) - run_release_for_species(args.taxonomy_id, args.assembly_accessions, args.release_version, args.resume)