Skip to content

Commit

Permalink
Merge pull request #445 from tcezard/EVA3567_static_nextflow
Browse files Browse the repository at this point in the history
EVA-3567 - Use a static nextflow pipeline
  • Loading branch information
tcezard authored May 14, 2024
2 parents 7dbd1fd + 14557c8 commit 948cf3e
Show file tree
Hide file tree
Showing 7 changed files with 295 additions and 187 deletions.
1 change: 0 additions & 1 deletion eva-accession-release-automation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ pip install -e /path/to/eva-accession/eva-accession-release-automation
The release automation script `run_release_for_species.py` has the following parameters:
* **common-release-properties-file:** Path to yaml config file, see below
* **taxonomy-id:** Taxonomy to release
* **memory:** Amount of memory to use when running the release jobs

You also need to set `PYTHONPATH=/path/to/eva-accession/eva-accession-release-automation/run_release_in_embassy`.

Expand Down
Original file line number Diff line number Diff line change
@@ -1,25 +0,0 @@
# Copyright 2020 EMBL - European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import sys


def init_logger():
    """Configure logging to stdout and return this module's logger.

    Calls ``logging.basicConfig`` so that records of level INFO and above are
    written to ``sys.stdout``, formatted as timestamp, level name and message.

    Returns:
        The ``logging.Logger`` named after this module (``__name__``).
    """
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)-15s %(levelname)s %(message)s')
    return logging.getLogger(__name__)


# Module-level logger: importing this module configures logging as a side effect.
logger = init_logger()
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def remote_db_is_empty(local_forwarded_port, assembly_accession, destination_db_
logger.info(f"check if: {assembly_accession} has been copied")
command = f"mongosh --eval 'use {destination_db_name}' --eval 'db.stats()' --port {local_forwarded_port} " \
"| grep 'dataSize' | awk '{print substr($2,1,length($2)-1)}'"
output = run_command_with_output(command)
output = run_command_with_output('Check that data exists in mongodb', command, return_process_output=True)
return output.strip() == '0'


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import signal
import traceback
from functools import lru_cache

from ebi_eva_common_pyutils.logger import logging_config

from run_release_in_embassy.release_metadata import get_target_mongo_instance_for_assembly
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_common_pyutils.network_utils import get_available_local_port, forward_remote_port_to_local_port
from ebi_eva_common_pyutils.taxonomy import taxonomy

logger = logging.getLogger(__name__)
logger = logging_config.get_logger(__name__)


def open_mongo_port_to_tempmongo(private_config_xml_file, profile, taxonomy_id, assembly,
Expand Down Expand Up @@ -90,5 +92,6 @@ def get_release_db_name_in_tempmongo_instance(taxonomy_id, assembly_accession):
return "acc_" + str(taxonomy_id) + "_" + assembly_accession.replace('.', '_')


@lru_cache
def get_release_folder_name(taxonomy_id):
    """Return the release folder name for a taxonomy.

    The name is whatever ``taxonomy.get_normalized_scientific_name_from_ensembl``
    returns for ``taxonomy_id``. Results are memoised with ``lru_cache``, so
    repeated calls with the same ``taxonomy_id`` reuse the first result instead
    of calling the helper again.

    Args:
        taxonomy_id: Taxonomy identifier forwarded to the helper. It is used
            as the cache key and therefore must be hashable.

    Returns:
        The value returned by the helper for ``taxonomy_id``.
    """
    # NOTE(review): the helper presumably performs a remote Ensembl lookup,
    # which would be the reason for caching - confirm.
    return taxonomy.get_normalized_scientific_name_from_ensembl(taxonomy_id)
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
#!/usr/bin/env nextflow

nextflow.enable.dsl=2

// Entry workflow: runs every release step for one assembly as a single linear chain.
// The literal 'initiate' is only a start token for the first process. Each process
// declares `val flag` as input and emits `true`, and the pipe operator feeds that
// emitted value into the next process, so a step is launched only after the
// previous one has produced its output.
workflow {
initiate_release_status_for_assembly('initiate') | copy_accessioning_collections_to_embassy | run_release_for_assembly | \
merge_dbsnp_eva_release_files | sort_bgzip_index_release_files | validate_release_vcf_files | \
analyze_vcf_validation_results | count_rs_ids_in_release_files | validate_rs_release_files | \
update_sequence_names_to_ena | update_release_status_for_assembly
}

// First step of the chain: invokes the Python module
// run_release_in_embassy.initiate_release_status_for_assembly for
// params.taxonomy / params.assembly / params.release_version, pointing it at the
// eva_progress_tracker.clustering_release_tracker table and at the Maven settings
// file and profile taken from params.maven.
// NOTE(review): 'short_time' and 'med_mem' are presumably mapped to resource
// settings in the Nextflow config - confirm.
process initiate_release_status_for_assembly {

label 'short_time', 'med_mem'

// `flag` is never referenced in the script; it only sequences this step.
input:
val flag

// Emits a constant `true` (named `flag`) that downstream steps consume as their input.
output:
val true, emit: flag

// PYTHONPATH is set from params.python_path so that `-m run_release_in_embassy...`
// can be resolved; stdout and stderr are both appended to params.log_file.
script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.initiate_release_status_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-version $params.release_version 1>> $params.log_file 2>&1
"""
}

// Invokes the Python module run_release_in_embassy.copy_accessioning_collections_to_embassy
// for params.taxonomy / params.assembly / params.release_version, passing the
// tracker table eva_progress_tracker.clustering_release_tracker and
// params.dump_dir as --dump-dir.
// NOTE(review): 'long_time' and 'med_mem' are presumably mapped to resource
// settings in the Nextflow config - confirm.
process copy_accessioning_collections_to_embassy {

label 'long_time', 'med_mem'

// `flag` is never referenced in the script; it only sequences this step.
input:
val flag

// Emits a constant `true` (named `flag`) that downstream steps consume as their input.
output:
val true, emit: flag

// PYTHONPATH is set from params.python_path; stdout and stderr are both
// appended to params.log_file.
script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.copy_accessioning_collections_to_embassy --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --dump-dir $params.dump_dir 1>> $params.log_file 2>&1
"""
}

// Invokes the Python module run_release_in_embassy.run_release_for_assembly for
// params.taxonomy / params.assembly / params.release_version, passing
// params.assembly_folder as --species-release-folder and
// params.jar.release_pipeline as --release-jar-path.
// NOTE(review): 'long_time' and 'med_mem' are presumably mapped to resource
// settings in the Nextflow config - confirm.
process run_release_for_assembly {

label 'long_time', 'med_mem'

// `flag` is never referenced in the script; it only sequences this step.
input:
val flag

// Emits a constant `true` (named `flag`) that downstream steps consume as their input.
output:
val true, emit: flag

// PYTHONPATH is set from params.python_path; stdout and stderr are both
// appended to params.log_file.
script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.run_release_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder --release-jar-path $params.jar.release_pipeline 1>> $params.log_file 2>&1
"""
}

// Invokes the Python module run_release_in_embassy.merge_dbsnp_eva_release_files
// for params.taxonomy / params.assembly / params.release_version, passing the
// bgzip, bcftools and VCF sort script paths from params.executable and
// params.assembly_folder as --species-release-folder.
// NOTE(review): 'long_time' and 'med_mem' are presumably mapped to resource
// settings in the Nextflow config - confirm.
process merge_dbsnp_eva_release_files {

label 'long_time', 'med_mem'

// `flag` is never referenced in the script; it only sequences this step.
input:
val flag

// Emits a constant `true` (named `flag`) that downstream steps consume as their input.
output:
val true, emit: flag

// PYTHONPATH is set from params.python_path; stdout and stderr are both
// appended to params.log_file.
script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.merge_dbsnp_eva_release_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --bgzip-path $params.executable.bgzip --bcftools-path $params.executable.bcftools --vcf-sort-script-path $params.executable.sort_vcf_sorted_chromosomes --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1
"""
}

// Invokes the Python module run_release_in_embassy.sort_bgzip_index_release_files
// for params.taxonomy / params.assembly, passing the bgzip, bcftools and VCF sort
// script paths from params.executable and params.assembly_folder as
// --species-release-folder. Unlike most other steps, no Maven settings/profile
// or tracker table is passed to this module.
// NOTE(review): 'long_time' and 'med_mem' are presumably mapped to resource
// settings in the Nextflow config - confirm.
process sort_bgzip_index_release_files {

label 'long_time', 'med_mem'

// `flag` is never referenced in the script; it only sequences this step.
input:
val flag

// Emits a constant `true` (named `flag`) that downstream steps consume as their input.
output:
val true, emit: flag

// PYTHONPATH is set from params.python_path; stdout and stderr are both
// appended to params.log_file.
script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.sort_bgzip_index_release_files --bgzip-path $params.executable.bgzip --bcftools-path $params.executable.bcftools --vcf-sort-script-path $params.executable.sort_vcf_sorted_chromosomes --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1
"""
}

// Invokes the Python module run_release_in_embassy.validate_release_vcf_files for
// params.taxonomy / params.assembly / params.release_version, passing
// params.assembly_folder as --species-release-folder plus the vcf_validator and
// vcf_assembly_checker executables from params.executable.
// NOTE(review): 'long_time' and 'med_mem' are presumably mapped to resource
// settings in the Nextflow config - confirm.
process validate_release_vcf_files {

label 'long_time', 'med_mem'

// `flag` is never referenced in the script; it only sequences this step.
input:
val flag

// Emits a constant `true` (named `flag`) that downstream steps consume as their input.
output:
val true, emit: flag

// PYTHONPATH is set from params.python_path; stdout and stderr are both
// appended to params.log_file.
script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.validate_release_vcf_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder --vcf-validator-path $params.executable.vcf_validator --assembly-checker-path $params.executable.vcf_assembly_checker 1>> $params.log_file 2>&1
"""
}

// Invokes the Python module run_release_in_embassy.analyze_vcf_validation_results
// with only params.assembly_folder (--species-release-folder) and
// params.assembly (--assembly-accession).
// NOTE(review): 'long_time' and 'med_mem' are presumably mapped to resource
// settings in the Nextflow config - confirm.
process analyze_vcf_validation_results {

label 'long_time', 'med_mem'

// `flag` is never referenced in the script; it only sequences this step.
input:
val flag

// Emits a constant `true` (named `flag`) that downstream steps consume as their input.
output:
val true, emit: flag

// PYTHONPATH is set from params.python_path; stdout and stderr are both
// appended to params.log_file.
script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.analyze_vcf_validation_results --species-release-folder $params.assembly_folder --assembly-accession $params.assembly 1>> $params.log_file 2>&1
"""
}

// Invokes the Python module run_release_in_embassy.count_rs_ids_in_release_files
// for params.taxonomy / params.assembly, passing
// params.executable.count_ids_in_vcf as --count-ids-script-path and
// params.assembly_folder as --species-release-folder.
// NOTE(review): 'long_time' and 'med_mem' are presumably mapped to resource
// settings in the Nextflow config - confirm.
process count_rs_ids_in_release_files {

label 'long_time', 'med_mem'

// `flag` is never referenced in the script; it only sequences this step.
input:
val flag

// Emits a constant `true` (named `flag`) that downstream steps consume as their input.
output:
val true, emit: flag

// PYTHONPATH is set from params.python_path; stdout and stderr are both
// appended to params.log_file.
script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.count_rs_ids_in_release_files --count-ids-script-path $params.executable.count_ids_in_vcf --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1
"""
}

// Invokes the Python module run_release_in_embassy.validate_rs_release_files for
// params.taxonomy / params.assembly / params.release_version, passing the Maven
// settings file and profile, the tracker table
// eva_progress_tracker.clustering_release_tracker and params.assembly_folder as
// --species-release-folder.
// NOTE(review): 'long_time' and 'med_mem' are presumably mapped to resource
// settings in the Nextflow config - confirm.
process validate_rs_release_files {

label 'long_time', 'med_mem'

// `flag` is never referenced in the script; it only sequences this step.
input:
val flag

// Emits a constant `true` (named `flag`) that downstream steps consume as their input.
output:
val true, emit: flag

// PYTHONPATH is set from params.python_path; stdout and stderr are both
// appended to params.log_file.
script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.validate_rs_release_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1
"""
}

// Invokes the Python module run_release_in_embassy.update_sequence_names_to_ena
// for params.taxonomy / params.assembly, passing params.assembly_folder as
// --species-release-folder, params.executable.convert_vcf_file as
// --sequence-name-converter-path and params.executable.bcftools as --bcftools-path.
// NOTE(review): 'long_time' and 'med_mem' are presumably mapped to resource
// settings in the Nextflow config - confirm.
process update_sequence_names_to_ena {

label 'long_time', 'med_mem'

// `flag` is never referenced in the script; it only sequences this step.
input:
val flag

// Emits a constant `true` (named `flag`) that downstream steps consume as their input.
output:
val true, emit: flag

// PYTHONPATH is set from params.python_path; stdout and stderr are both
// appended to params.log_file.
script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.update_sequence_names_to_ena --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder --sequence-name-converter-path $params.executable.convert_vcf_file --bcftools-path $params.executable.bcftools 1>> $params.log_file 2>&1
"""
}

// Last step of the chain: invokes the Python module
// run_release_in_embassy.update_release_status_for_assembly for
// params.taxonomy / params.assembly / params.release_version, pointing it at the
// eva_progress_tracker.clustering_release_tracker table and at the Maven settings
// file and profile taken from params.maven.
// NOTE(review): 'short_time' and 'med_mem' are presumably mapped to resource
// settings in the Nextflow config - confirm.
process update_release_status_for_assembly {

label 'short_time', 'med_mem'

// `flag` is never referenced in the script; it only sequences this step.
input:
val flag

// Emits a constant `true` under the name `flag`, the same output name used by
// every other process in this pipeline (it was previously misnamed `flag11`).
output:
val true, emit: flag

// PYTHONPATH is set from params.python_path so that `-m run_release_in_embassy...`
// can be resolved; stdout and stderr are both appended to params.log_file.
script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.update_release_status_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-version $params.release_version 1>> $params.log_file 2>&1
"""
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,17 @@
# limitations under the License.

import click
import logging
import sys
import traceback

from ebi_eva_common_pyutils.logger import logging_config

from run_release_in_embassy.create_release_properties_file import create_release_properties_file_for_assembly
from run_release_in_embassy.release_common_utils import open_mongo_port_to_tempmongo, close_mongo_port_to_tempmongo
from ebi_eva_common_pyutils.command_utils import run_command_with_output


logger = logging.getLogger(__name__)
logger = logging_config.get_logger(__name__)


def run_release_for_assembly(private_config_xml_file, profile, taxonomy_id, assembly_accession,
Expand Down
Loading

0 comments on commit 948cf3e

Please sign in to comment.