From 763838278d3ede4edc5b0d30f7deb251369ebcd5 Mon Sep 17 00:00:00 2001 From: April Shen Date: Fri, 22 Mar 2024 13:19:31 +0000 Subject: [PATCH] move create progress table to release automation, delete clustering automation folder --- eva-accession-clustering-automation/README.md | 36 ---- .../clustering_automation/__init__.py | 0 .../cluster_from_mongo.py | 147 -------------- .../clustering_automation/cluster_from_vcf.py | 125 ------------ .../create_clustering_properties.py | 189 ------------------ .../update_clustering_status.py | 58 ------ .../requirements.txt | 2 - eva-accession-clustering-automation/setup.py | 19 -- .../create_clustering_progress_table.py | 0 9 files changed, 576 deletions(-) delete mode 100644 eva-accession-clustering-automation/README.md delete mode 100644 eva-accession-clustering-automation/clustering_automation/__init__.py delete mode 100644 eva-accession-clustering-automation/clustering_automation/cluster_from_mongo.py delete mode 100644 eva-accession-clustering-automation/clustering_automation/cluster_from_vcf.py delete mode 100644 eva-accession-clustering-automation/clustering_automation/create_clustering_properties.py delete mode 100644 eva-accession-clustering-automation/clustering_automation/update_clustering_status.py delete mode 100644 eva-accession-clustering-automation/requirements.txt delete mode 100644 eva-accession-clustering-automation/setup.py rename {eva-accession-clustering-automation/clustering_automation => eva-accession-release-automation}/create_clustering_progress_table.py (100%) diff --git a/eva-accession-clustering-automation/README.md b/eva-accession-clustering-automation/README.md deleted file mode 100644 index b97f48673..000000000 --- a/eva-accession-clustering-automation/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# Pre-requisites -* Install the **ebi_eva_common_pyutils** module in your local python environment - ```bash - pip3 install -r requirements.txt - ``` - -# Usage -## Cluster multiple assemblies -The clustering automation script have the following parameters: -* **source\*:** The possible sources are Mongo or VCF -* **asm-vcf-prj-list:** Is a list of one or many assembly#vcf#project combinations. This is required if the source is VCF -* **assembly-list:** Is a list of assemblies to process. This is required if the source is Mongo -* **private-config-xml-file\*:** Maven settings.xml file with the profiles that hold database connection data -* **profile\*:** Profile to run the pipeline. e.g. production -* **output-directory:** Directory where the generated files will be stored -* **logs-directory:** Directory where the logs will be stored -* **clustering-artifact\*:** Clustering artifact path is the latest version of the clustering pipeline -* **only-printing:** Is a flag to only get the commands but not run them -* **memory:** Amount of memory to use when running the clustering jobs - - -## Examples -* Example using Mongo as source - ```bash - python3 path/to/eva-accession/eva-accession-clustering-automation/cluster_multiple_assemblies.py --source MONGO --assembly-list GCA_000233375.4 GCA_000002285.2 --output-directory /output/clustering_automation --logs-directory /output/logs --only-printing --clustering-artifact cluster.jar --profile production --private-config-xml-file /configuration/eva-maven-settings.xml - ``` - -* Example using VCF as source - ```bash - python3 path/to/eva-accession/eva-accession-clustering-automation/cluster_multiple_assemblies.py --source VCF --asm-vcf-prj-list GCA_000233375.4#/nfs/eva/accessioned.vcf.gz#PRJEB1111 GCA_000002285.2#/nfs/eva/file.vcf.gz#PRJEB2222 --output-directory /output/clustering_automation --logs-directory /output/logs --only-printing --clustering-artifact cluster.jar --profile production --private-config-xml-file /configuration/eva-maven-settings.xml - ``` - - -## Notes -* The **settings xml file** should be passed using the parameter --private-config-xml-file. If it is being run from the - EBI cluster deploy the configuration repository and point to the eva settings xml file. \ No newline at end of file diff --git a/eva-accession-clustering-automation/clustering_automation/__init__.py b/eva-accession-clustering-automation/clustering_automation/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/eva-accession-clustering-automation/clustering_automation/cluster_from_mongo.py b/eva-accession-clustering-automation/clustering_automation/cluster_from_mongo.py deleted file mode 100644 index 3e992ada9..000000000 --- a/eva-accession-clustering-automation/clustering_automation/cluster_from_mongo.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright 2020 EMBL - European Bioinformatics Institute -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -import sys -import logging -import datetime -import yaml - -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle -from ebi_eva_common_pyutils.nextflow import LinearNextFlowPipeline, NextFlowProcess -from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query - -from clustering_automation.create_clustering_properties import create_properties_file -from ebi_eva_common_pyutils.taxonomy.taxonomy import normalise_taxon_scientific_name - -logger = logging.getLogger(__name__) -timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") - - -def get_assemblies_and_scientific_name_from_taxonomy(taxonomy_id, metadata_connection_handle, clustering_tracking_table, release_version): - query = (f"SELECT assembly_accession, scientific_name FROM {clustering_tracking_table} " - f"WHERE taxonomy = '{taxonomy_id}' " - f"and release_version = {release_version} " - f"and assembly_accession <> 'Unmapped' " - f"and should_be_clustered = 't'") - results = get_all_results_for_query(metadata_connection_handle, query) - if len(results) == 0: - raise Exception("Could not find assemblies pertaining to taxonomy ID: " + taxonomy_id) - return [result[0] for result in results], results[0][1] - - -def get_common_clustering_properties(common_clustering_properties_file): - return yaml.load(open(common_clustering_properties_file), Loader=yaml.FullLoader) - - -def generate_linear_pipeline(taxonomy_id, scientific_name, assembly_list, common_properties, memory, instance, enable_retryable): - private_config_xml_file = common_properties["private-config-xml-file"] - profile = common_properties["profile"] - clustering_artifact = common_properties["clustering-jar-path"] - python = common_properties["python3-path"] - release_version = common_properties['release-version'] - clustering_folder = common_properties['clustering-folder'] - clustering_tracking_table = common_properties['clustering-release-tracker'] - - pipeline = LinearNextFlowPipeline() - species_directory = os.path.join(clustering_folder, f"{normalise_taxon_scientific_name(scientific_name)}_{taxonomy_id}") - for assembly in assembly_list: - output_directory = os.path.join(species_directory, assembly) - os.makedirs(output_directory, exist_ok=True) - properties_path = create_properties_file('MONGO', None, None, assembly, - private_config_xml_file, profile, output_directory, instance, enable_retryable) - status_update_template = (f'{python} -m clustering_automation.update_clustering_status ' - f'--private-config-xml-file {private_config_xml_file} ' - f'--clustering-tracking-table {clustering_tracking_table} ' - f'--release {release_version} ' - f'--assembly {assembly} ' - f'--taxonomy {taxonomy_id} ' - '--status {status}') # will be filled in later - - suffix = assembly.replace('.', '_') - pipeline.add_process( - process_name=f'start_{suffix}', - command_to_run=status_update_template.format(status='Started'), - ) - - process_directives_for_java_pipelines = {'memory': f'{memory} MB', - 'clusterOptions': (f'-o {output_directory}/cluster_{timestamp}.log ' - f'-e {output_directory}/cluster_{timestamp}.err')} - # Refer to ProcessRemappedVariantsWithRSJobConfiguration.java and ClusterUnclusteredVariantsJobConfiguration.java - # for descriptions and rationale for 2 separate jobs - # Access to internal method _add_new_process needed for process_directives - pipeline._add_new_process(NextFlowProcess( - process_name=f'process_remapped_variants_with_rs_{suffix}', - command_to_run=f'java -Xmx{memory}m -jar {clustering_artifact} --spring.config.location=file:{properties_path} ' - f'--spring.batch.job.names=PROCESS_REMAPPED_VARIANTS_WITH_RS_JOB', - process_directives='clusterOptions': f"{process_directives_for_java_pipelines['clusterOptions']}" - f" -g /accession "} # Limits the overall number of jobs using job tracker - )) - pipeline._add_new_process(NextFlowProcess( - process_name=f'cluster_{suffix}', - command_to_run=f'java -Xmx{memory}m -jar {clustering_artifact} --spring.config.location=file:{properties_path} ' - f'--spring.batch.job.names=CLUSTER_UNCLUSTERED_VARIANTS_JOB', - process_directives={'memory': process_directives_for_java_pipelines['memory'], - 'clusterOptions': f"{process_directives_for_java_pipelines['clusterOptions']}" - f" -g /accession/instance-{instance} "} # needed to serialize accessioning - )) - pipeline.add_process( - process_name=f'end_{suffix}', - command_to_run=status_update_template.format(status='Completed') # TODO: how to choose completed/failed? - ) - # TODO add QA process - return pipeline, species_directory - - -def cluster_multiple_from_mongo(taxonomy_id, common_clustering_properties_file, memory, instance, enable_retryable): - """ - Generates and runs a Nextflow pipeline to cluster all assemblies for a given taxonomy. - """ - common_properties = get_common_clustering_properties(common_clustering_properties_file) - clustering_tracking_table = common_properties["clustering-release-tracker"] - release_version = common_properties["release-version"] - clustering_folder = common_properties['clustering-folder'] - with get_metadata_connection_handle("production_processing", common_properties["private-config-xml-file"]) as metadata_connection_handle: - assembly_list, scientific_name = get_assemblies_and_scientific_name_from_taxonomy(taxonomy_id, metadata_connection_handle, clustering_tracking_table, release_version) - pipeline, output_directory = generate_linear_pipeline(taxonomy_id, scientific_name, assembly_list, common_properties, memory, instance, enable_retryable) - pipeline.run_pipeline( - workflow_file_path=os.path.join(clustering_folder, f'{taxonomy_id}_clustering_workflow_{timestamp}.nf'), - nextflow_binary_path=common_properties['nextflow-binary-path'], - nextflow_config_path=common_properties['nextflow-config-path'], - working_dir=output_directory - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Cluster multiple assemblies', add_help=False) - parser.add_argument("--taxonomy-id", help="Taxonomy id", required=True) - parser.add_argument("--common-clustering-properties-file", help="ex: /path/to/clustering/properties.yml", required=True) - parser.add_argument("--memory", help="Amount of memory jobs will use", required=False, default=8192) - parser.add_argument("--instance", help="Accessioning instance id", required=False, default=6, - type=int, choices=range(1, 13)) - parser.add_argument("--enable-retryable", help="Set the clustering to use the retryable reader", default=False, - action='store_true') - parser.add_argument('--help', action='help', help='Show this help message and exit') - - args = {} - try: - args = parser.parse_args() - cluster_multiple_from_mongo(args.taxonomy_id, args.common_clustering_properties_file, args.memory, - args.instance, args.enable_retryable) - except Exception as ex: - logger.exception(ex) - sys.exit(1) - - sys.exit(0) diff --git a/eva-accession-clustering-automation/clustering_automation/cluster_from_vcf.py b/eva-accession-clustering-automation/clustering_automation/cluster_from_vcf.py deleted file mode 100644 index b4fd117e5..000000000 --- a/eva-accession-clustering-automation/clustering_automation/cluster_from_vcf.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright 2020 EMBL - European Bioinformatics Institute -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -import sys -import logging -import datetime -from ebi_eva_common_pyutils.command_utils import run_command_with_output - -from create_clustering_properties import create_properties_file - - -logger = logging.getLogger(__name__) -timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") - - -def generate_bsub_command(assembly_accession, properties_path, logs_directory, clustering_artifact, memory, dependency): - job_name = get_job_name(assembly_accession) - log_file = '{assembly_accession}_cluster_{timestamp}.log'.format(assembly_accession=assembly_accession, - timestamp=timestamp) - error_file = '{assembly_accession}_cluster_{timestamp}.err'.format(assembly_accession=assembly_accession, - timestamp=timestamp) - if logs_directory: - log_file = os.path.join(logs_directory, log_file) - error_file = os.path.join(logs_directory, error_file) - - memory_amount = 8192 - if memory: - memory_amount = memory - - dependency_param = '' - if dependency: - dependency_param = '-w {dependency} '.format(dependency=dependency) - - command = 'bsub {dependency_param}-J {job_name} -o {log_file} -e {error_file} -M {memory_amount} ' \ - '-R "rusage[mem={memory_amount}]" java -jar {clustering_artifact} ' \ - '--spring.config.location=file:{properties_path} --spring.batch.job.names=CLUSTERING_FROM_VCF_JOB'\ - .format(dependency_param=dependency_param, job_name=job_name, log_file=log_file, error_file=error_file, - memory_amount=memory_amount, clustering_artifact=clustering_artifact, properties_path=properties_path) - - print(command) - add_to_command_file(properties_path, command) - return command - - -def get_job_name(assembly_accession): - return '{timestamp}_cluster_{assembly_accession}'.format(assembly_accession=assembly_accession, timestamp=timestamp) - - -def add_to_command_file(properties_path, command): - """ - This method writes the commands to a text file in the output folder - """ - commands_path = os.path.dirname(properties_path) + '/commands_' + timestamp + '.txt' - with open(commands_path, 'a+') as commands: - commands.write(command + '\n') - - -def cluster_one(vcf_file, project_accession, assembly_accession, private_config_xml_file, profile, - output_directory, logs_directory, clustering_artifact, only_printing, memory, instance, dependency): - properties_path = create_properties_file('VCF', vcf_file, project_accession, assembly_accession, - private_config_xml_file, profile, output_directory, instance) - command = generate_bsub_command(assembly_accession, properties_path, logs_directory, clustering_artifact, memory, - dependency) - if not only_printing: - run_command_with_output('Run clustering command', command, return_process_output=True) - - -def cluster_multiple_from_vcf(asm_vcf_prj_list, private_config_xml_file, profile, - output_directory, logs_directory, clustering_artifact, only_printing, memory, instance): - """ - The list will be of the form: GCA_000000001.1#/file1.vcf.gz#PRJEB1111 GCA_000000002.2#/file2.vcf.gz#PRJEB2222 ... - This method splits the triplets and then call the run_clustering method for each one - """ - dependency = None - for triplet in asm_vcf_prj_list: - data = triplet.split('#') - assembly_accession = data[0] - vcf_file = data[1] - project_accession = data[2] - cluster_one(vcf_file, project_accession, assembly_accession, private_config_xml_file, profile, - output_directory, logs_directory, clustering_artifact, only_printing, memory, instance, dependency) - dependency = get_job_name(assembly_accession) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Cluster multiple assemblies', add_help=False) - parser.add_argument("--asm-vcf-prj-list", help="List of Assembly, VCF, project to be clustered, " - "e.g. GCA_000233375.4#/nfs/eva/accessioned.vcf.gz#PRJEB1111 " - "GCA_000002285.2#/nfs/eva/file.vcf.gz#PRJEB2222. " - "Required when the source is VCF", required=True, nargs='+') - parser.add_argument("--private-config-xml-file", help="ex: /path/to/eva-maven-settings.xml", required=True) - parser.add_argument("--profile", help="Profile to get the properties, e.g.production", required=True) - parser.add_argument("--output-directory", help="Output directory for the properties file", required=False) - parser.add_argument("--logs-directory", help="Directory for logs files", required=False) - parser.add_argument("--clustering-artifact", help="Artifact of the clustering pipeline", required=True) - parser.add_argument("--only-printing", help="Prepare and write the commands, but don't run them", - action='store_true', required=False) - parser.add_argument("--memory", help="Amount of memory jobs will use", required=False, default=8192) - parser.add_argument("--instance", help="Accessioning instance id", required=False, default=1, choices=range(1, 13)) - parser.add_argument('--help', action='help', help='Show this help message and exit') - - args = {} - try: - args = parser.parse_args() - cluster_multiple_from_vcf(args.asm_vcf_prj_list, args.private_config_xml_file, - args.profile, args.output_directory, args.logs_directory, args.clustering_artifact, - args.only_printing, args.memory, args.instance) - except Exception as ex: - logger.exception(ex) - sys.exit(1) - - sys.exit(0) diff --git a/eva-accession-clustering-automation/clustering_automation/create_clustering_properties.py b/eva-accession-clustering-automation/clustering_automation/create_clustering_properties.py deleted file mode 100644 index c9f66736f..000000000 --- a/eva-accession-clustering-automation/clustering_automation/create_clustering_properties.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright 2020 EMBL - European Bioinformatics Institute -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import argparse -import logging -from ebi_eva_common_pyutils.config_utils import get_properties_from_xml_file - -logger = logging.getLogger(__name__) - - -def create_properties_file(source, vcf_file, project_accession, assembly_accession, private_config_xml_file, profile, - output_directory, instance, enable_retryable=False): - """ - This method creates the application properties file - """ - check_vcf_source_requirements(source, vcf_file, project_accession) - properties = get_properties_from_xml_file(profile, private_config_xml_file) - path = get_properties_path(source, vcf_file, project_accession, assembly_accession, output_directory) - with open(path, 'w') as properties_file: - add_clustering_properties(properties_file, assembly_accession, project_accession, source, enable_retryable) - add_accessioning_properties(properties_file, instance) - add_count_service_properties(properties_file, properties) - add_mongo_properties(properties_file, properties) - add_job_tracker_properties(properties_file, properties) - add_spring_properties(properties_file) - return path - - -def get_properties_path(source, vcf_file, project_accession, assembly_accession, output_directory): - path = output_directory + '/' + assembly_accession - if source.upper() == 'VCF': - path += '_' + os.path.basename(vcf_file) + '_' + project_accession - path += '.properties' - return path - - -def add_clustering_properties(properties_file, assembly_accession, project_accession, vcf_file, enable_retryable): - vcf = vcf_file or '' - project = project_accession or '' - - clustering_properties = (f""" -parameters.assemblyAccession={assembly_accession} -parameters.remappedFrom= -parameters.vcf={vcf} -parameters.projectAccession={project} -parameters.allowRetry={str(enable_retryable).lower()} -parameters.projects= -parameters.rsReportPath={assembly_accession}_rs_report.txt -""") - properties_file.write(clustering_properties) - - -def add_accessioning_properties(properties_file, instance): - properties_file.write(f""" -parameters.chunkSize=200 - -accessioning.instanceId=instance-{instance} -accessioning.submitted.categoryId=ss -accessioning.clustered.categoryId=rs - -accessioning.monotonic.ss.blockSize=100000 -accessioning.monotonic.ss.blockStartValue=5000000000 -accessioning.monotonic.ss.nextBlockInterval=1000000000 -accessioning.monotonic.rs.blockSize=100000 -accessioning.monotonic.rs.blockStartValue=3000000000 -accessioning.monotonic.rs.nextBlockInterval=1000000000 -""") - - -def add_spring_properties(properties_file): - properties_file.write(""" -#See https://github.com/spring-projects/spring-boot/wiki/Spring-Boot-2.1-Release-Notes#bean-overriding -spring.main.allow-bean-definition-overriding=true -#As this is a spring batch application, disable the embedded tomcat. This is the new way to do that for spring 2. -spring.main.web-application-type=none - -# This entry is put just to avoid a warning message in the logs when you start the spring-boot application. -# This bug is from hibernate which tries to retrieve some metadata from postgresql db and failed to find that and logs as a warning -# It doesnt cause any issue though. -spring.jpa.properties.hibernate.jdbc.lob.non_contextual_creation = true""") - - -def add_mongo_properties(properties_file, properties): - mongo_hosts_and_ports = str(properties['eva.mongo.host']) - mongo_host, mongo_port = get_mongo_primary_host_and_port(mongo_hosts_and_ports) - mongo_database = str(properties['eva.accession.mongo.database']) - mongo_username = str(properties['eva.mongo.user']) - mongo_password = str(properties['eva.mongo.passwd']) - - mongo_properties = (""" -spring.data.mongodb.host={host} -spring.data.mongodb.port={port} -spring.data.mongodb.database={database} -spring.data.mongodb.username={username} -spring.data.mongodb.password={password} -spring.data.mongodb.authentication-database=admin -mongodb.read-preference=primary - """).format(database=mongo_database, username=mongo_username, password=mongo_password, host=mongo_host, - port=mongo_port) - properties_file.write(mongo_properties) - -def add_count_service_properties(properties_file, properties): - count_service_url = str(properties['eva.count-stats.url']) - count_service_username = str(properties['eva.count-stats.username']) - count_service_password = str(properties['eva.count-stats.password']) - - count_service_properties = (""" -eva.count-stats.url={url} -eva.count-stats.username={username} -eva.count-stats.password={password} - """).format(url=count_service_url, username=count_service_username, password=count_service_password) - properties_file.write(count_service_properties) - - -def get_mongo_primary_host_and_port(mongo_hosts_and_ports): - """ - :param mongo_hosts_and_ports: All host and ports stored in the private settings xml - :return: mongo primary host and port - """ - for host_and_port in mongo_hosts_and_ports.split(','): - if '001' in host_and_port: - properties = host_and_port.split(':') - return properties[0], properties[1] - - -def add_job_tracker_properties(properties_file, properties): - postgres_url = str(properties['eva.accession.jdbc.url']) - postgres_username = str(properties['eva.accession.user']) - postgres_password = str(properties['eva.accession.password']) - - postgres_properties = (""" -spring.datasource.driver-class-name=org.postgresql.Driver -spring.datasource.url={postgres_url} -spring.datasource.username={postgres_username} -spring.datasource.password={postgres_password} -spring.datasource.tomcat.max-active=3 - """).format(postgres_url=postgres_url, postgres_username=postgres_username, postgres_password=postgres_password) - properties_file.write(postgres_properties) - - -def check_vcf_source_requirements(source, vcf_file, project_accession): - """ - This method checks that if the source is VCF the VCF file and project accession are provided - """ - if source == 'VCF' and not (vcf_file and project_accession): - raise ValueError('If the source is VCF the file path and project accession must be provided') - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Create clustering properties file', add_help=False) - parser.add_argument("--source", help="mongo database or VCF", required=True, choices=['VCF', 'MONGO']) - parser.add_argument("--vcf-file", help="Path to the VCF file, required when the source is VCF", required=False) - parser.add_argument("--project-accession", help="Project accession, required when the source is VCF", - required=False) - parser.add_argument("--assembly-accession", help="Assembly for which the process has to be run, " - "e.g. GCA_000002285.2", required=True) - parser.add_argument("--instance", help="Accessioning instance id", required=False, default=1, - type=int, choices=range(1, 13)) - parser.add_argument("--private-config-xml-file", help="ex: /path/to/eva-maven-settings.xml", required=True) - parser.add_argument("--profile", help="Profile to get the properties, e.g.production", required=True) - parser.add_argument("--output-directory", help="Output directory for the properties file", required=False) - parser.add_argument("--enable-retryable", help="Set the clustering to use the retryable reader", default=False, - action='store_true') - parser.add_argument('--help', action='help', help='Show this help message and exit') - - args = {} - try: - args = parser.parse_args() - create_properties_file(args.source, args.vcf_file, args.project_accession, args.assembly_accession, - args.private_config_xml_file, args.profile, args.output_directory, args.instance, - args.enable_retryable) - except Exception as ex: - logger.exception(ex) - sys.exit(1) - - sys.exit(0) diff --git a/eva-accession-clustering-automation/clustering_automation/update_clustering_status.py b/eva-accession-clustering-automation/clustering_automation/update_clustering_status.py deleted file mode 100644 index 19bc73ddc..000000000 --- a/eva-accession-clustering-automation/clustering_automation/update_clustering_status.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2021 EMBL - European Bioinformatics Institute -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import argparse -import datetime -import logging - -from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle -from ebi_eva_common_pyutils.pg_utils import execute_query - - -logger = logging.getLogger(__name__) -timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") - - -def set_clustering_status(private_config_xml_file, clustering_tracking_table, assembly, tax_id, release_version, status): - now = datetime.datetime.now().isoformat() - update_status_query = f"UPDATE {clustering_tracking_table} " - update_status_query += f"SET clustering_status='{status}'" - if status == 'Started': - update_status_query += f", clustering_start='{now}'" - elif status == 'Completed': - update_status_query += f", clustering_end='{now}'" - update_status_query += (f" WHERE assembly_accession='{assembly}' AND taxonomy='{tax_id}' " - f"AND release_version={release_version}") - with get_metadata_connection_handle("production_processing", private_config_xml_file) as metadata_connection_handle: - execute_query(metadata_connection_handle, update_status_query) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Update clustering progress', add_help=False) - parser.add_argument("--private-config-xml-file", help="ex: /path/to/eva-maven-settings.xml", required=True) - parser.add_argument("--clustering-tracking-table", help="", required=True) - parser.add_argument("--release", help="Release version", required=True) - parser.add_argument("--assembly", help="Assembly accession", required=True) - parser.add_argument("--taxonomy", help="Taxonomy id", required=True) - parser.add_argument("--status", help="Status to set", required=True, choices=["Started", "Completed", "Failed"]) - args = {} - try: - args = parser.parse_args() - set_clustering_status(args.private_config_xml_file, args.clustering_tracking_table, args.assembly, args.taxonomy, args.release, args.status) - except Exception as ex: - logger.exception(ex) - sys.exit(1) - - sys.exit(0) diff --git a/eva-accession-clustering-automation/requirements.txt b/eva-accession-clustering-automation/requirements.txt deleted file mode 100644 index 3e2b31ef6..000000000 --- a/eva-accession-clustering-automation/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -ebi_eva_common_pyutils==0.3.16 -retry diff --git a/eva-accession-clustering-automation/setup.py b/eva-accession-clustering-automation/setup.py deleted file mode 100644 index 8de946510..000000000 --- a/eva-accession-clustering-automation/setup.py +++ /dev/null @@ -1,19 +0,0 @@ -import os -from setuptools import find_packages, setup - - -def get_requires(): - requires = [] - with open(os.path.join(os.path.dirname(__file__), "requirements.txt"), "rt") as req_file: - for line in req_file: - requires.append(line.rstrip()) - return requires - - -setup(name='clustering_automation', - version='0.0.1', - packages=find_packages(), - install_requires=get_requires(), - tests_require=get_requires(), - setup_requires=get_requires() -) diff --git a/eva-accession-clustering-automation/clustering_automation/create_clustering_progress_table.py b/eva-accession-release-automation/create_clustering_progress_table.py similarity index 100% rename from eva-accession-clustering-automation/clustering_automation/create_clustering_progress_table.py rename to eva-accession-release-automation/create_clustering_progress_table.py