Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3567 - Use a static nextflow pipeline #445

Merged
merged 8 commits into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion eva-accession-release-automation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ pip install -e /path/to/eva-accession/eva-accession-release-automation
The release automation script `run_release_for_species.py` has the following parameters:
* **common-release-properties-file:** Path to yaml config file, see below
* **taxonomy-id:** Taxonomy to release
* **memory:** Amount of memory to use when running the release jobs

You also need to set `PYTHONPATH=/path/to/eva-accession/eva-accession-release-automation/run_release_in_embassy`.

Expand Down
Original file line number Diff line number Diff line change
@@ -1,25 +0,0 @@
# Copyright 2020 EMBL - European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import sys


def init_logger():
    """Configure logging to stdout at INFO level and return this module's logger.

    ``logging.basicConfig`` is invoked with a timestamp / level / message
    format, then the logger registered under this module's ``__name__`` is
    looked up and handed back.

    Returns:
        logging.Logger: the logger named after this module.
    """
    log_format = '%(asctime)-15s %(levelname)s %(message)s'
    logging.basicConfig(format=log_format, level=logging.INFO, stream=sys.stdout)
    return logging.getLogger(__name__)


# Module-level logger, created when this module is imported; the call to
# init_logger() also runs logging.basicConfig as a side effect of the import.
logger = init_logger()
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def remote_db_is_empty(local_forwarded_port, assembly_accession, destination_db_
logger.info(f"check if: {assembly_accession} has been copied")
command = f"mongosh --eval 'use {destination_db_name}' --eval 'db.stats()' --port {local_forwarded_port} " \
"| grep 'dataSize' | awk '{print substr($2,1,length($2)-1)}'"
output = run_command_with_output(command)
output = run_command_with_output('Check that data exists in mongodb', command, return_process_output=True)
return output.strip() == '0'


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import signal
import traceback
from functools import lru_cache

from ebi_eva_common_pyutils.logger import logging_config

from run_release_in_embassy.release_metadata import get_target_mongo_instance_for_assembly
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_common_pyutils.network_utils import get_available_local_port, forward_remote_port_to_local_port
from ebi_eva_common_pyutils.taxonomy import taxonomy

# Module-level logger. The second assignment rebinds the name, so the logger
# returned by logging_config.get_logger is the one the rest of the module uses.
logger = logging.getLogger(__name__)
logger = logging_config.get_logger(__name__)


def open_mongo_port_to_tempmongo(private_config_xml_file, profile, taxonomy_id, assembly,
Expand Down Expand Up @@ -90,5 +92,6 @@ def get_release_db_name_in_tempmongo_instance(taxonomy_id, assembly_accession):
return "acc_" + str(taxonomy_id) + "_" + assembly_accession.replace('.', '_')


@lru_cache(maxsize=128)
def get_release_folder_name(taxonomy_id):
    """Return the release folder name used for a taxonomy.

    The name is whatever ``taxonomy.get_normalized_scientific_name_from_ensembl``
    returns for ``taxonomy_id``. Results are memoised per ``taxonomy_id`` so the
    lookup is performed at most once for each distinct value held in the cache.

    Args:
        taxonomy_id: taxonomy identifier passed through to the lookup; it must
            be hashable because it is used as the cache key.

    Returns:
        The value returned by the taxonomy lookup for ``taxonomy_id``.
    """
    folder_name = taxonomy.get_normalized_scientific_name_from_ensembl(taxonomy_id)
    return folder_name
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
#!/usr/bin/env nextflow

// Select the DSL2 syntax, which the workflow/process definitions below rely on.
nextflow.enable.dsl=2

workflow {
    // Release steps for a single assembly, run strictly one after another:
    // every process receives the `flag` output of the previous one, so it
    // cannot start before that step has completed.
    initiate_release_status_for_assembly('initiate')
    copy_accessioning_collections_to_embassy(initiate_release_status_for_assembly.out.flag)
    run_release_for_assembly(copy_accessioning_collections_to_embassy.out.flag)
    merge_dbsnp_eva_release_files(run_release_for_assembly.out.flag)
    sort_bgzip_index_release_files(merge_dbsnp_eva_release_files.out.flag)
    validate_release_vcf_files(sort_bgzip_index_release_files.out.flag)
    analyze_vcf_validation_results(validate_release_vcf_files.out.flag)
    count_rs_ids_in_release_files(analyze_vcf_validation_results.out.flag)
    validate_rs_release_files(count_rs_ids_in_release_files.out.flag)
    update_sequence_names_to_ena(validate_rs_release_files.out.flag)
    update_release_status_for_assembly(update_sequence_names_to_ena.out.flag)
}

// First step of the workflow: runs the Python module
// run_release_in_embassy.initiate_release_status_for_assembly for the taxonomy,
// assembly and release version given in params, using the interpreter and
// PYTHONPATH taken from params. stdout and stderr are appended to params.log_file.
// The input `flag` is not referenced by the script; it is received and a `true`
// value is emitted as `flag` only so the workflow can order the steps.
// NOTE(review): the 'short_time'/'med_mem' labels presumably map to resource
// settings in the Nextflow config, which is not visible here.
process initiate_release_status_for_assembly {

label 'short_time', 'med_mem'

input:
val flag

output:
val true, emit: flag

script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.initiate_release_status_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-version $params.release_version 1>> $params.log_file 2>&1
"""
}

// Runs the Python module run_release_in_embassy.copy_accessioning_collections_to_embassy
// for the taxonomy/assembly/release version in params, passing params.dump_dir as
// --dump-dir. stdout and stderr are appended to params.log_file.
// The input `flag` is not referenced by the script; it is received and re-emitted
// (as `true`) only so the workflow can run the steps in order.
process copy_accessioning_collections_to_embassy {

label 'long_time', 'med_mem'

input:
val flag

output:
val true, emit: flag

script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.copy_accessioning_collections_to_embassy --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --dump-dir $params.dump_dir 1>> $params.log_file 2>&1
"""
}

// Runs the Python module run_release_in_embassy.run_release_for_assembly for the
// taxonomy/assembly/release version in params, passing params.assembly_folder as
// --species-release-folder and params.jar.release_pipeline as --release-jar-path.
// stdout and stderr are appended to params.log_file.
// The input `flag` is not referenced by the script; it is received and re-emitted
// (as `true`) only so the workflow can run the steps in order.
process run_release_for_assembly {

label 'long_time', 'med_mem'

input:
val flag

output:
val true, emit: flag

script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.run_release_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder --release-jar-path $params.jar.release_pipeline 1>> $params.log_file 2>&1
"""
}

// Runs the Python module run_release_in_embassy.merge_dbsnp_eva_release_files for
// the taxonomy/assembly/release version in params, handing it the bgzip, bcftools
// and VCF sort script paths from params.executable and the release folder from
// params.assembly_folder. stdout and stderr are appended to params.log_file.
// The input `flag` is not referenced by the script; it is received and re-emitted
// (as `true`) only so the workflow can run the steps in order.
process merge_dbsnp_eva_release_files {

label 'long_time', 'med_mem'

input:
val flag

output:
val true, emit: flag

script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.merge_dbsnp_eva_release_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --bgzip-path $params.executable.bgzip --bcftools-path $params.executable.bcftools --vcf-sort-script-path $params.executable.sort_vcf_sorted_chromosomes --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1
"""
}

// Runs the Python module run_release_in_embassy.sort_bgzip_index_release_files for
// the taxonomy/assembly in params, handing it the bgzip, bcftools and VCF sort
// script paths from params.executable and the release folder from
// params.assembly_folder. stdout and stderr are appended to params.log_file.
// The input `flag` is not referenced by the script; it is received and re-emitted
// (as `true`) only so the workflow can run the steps in order.
process sort_bgzip_index_release_files {

label 'long_time', 'med_mem'

input:
val flag

output:
val true, emit: flag

script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.sort_bgzip_index_release_files --bgzip-path $params.executable.bgzip --bcftools-path $params.executable.bcftools --vcf-sort-script-path $params.executable.sort_vcf_sorted_chromosomes --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1
"""
}

// Runs the Python module run_release_in_embassy.validate_release_vcf_files for the
// taxonomy/assembly/release version in params, handing it the vcf_validator and
// vcf_assembly_checker paths from params.executable and the release folder from
// params.assembly_folder. stdout and stderr are appended to params.log_file.
// The input `flag` is not referenced by the script; it is received and re-emitted
// (as `true`) only so the workflow can run the steps in order.
process validate_release_vcf_files {

label 'long_time', 'med_mem'

input:
val flag

output:
val true, emit: flag

script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.validate_release_vcf_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder --vcf-validator-path $params.executable.vcf_validator --assembly-checker-path $params.executable.vcf_assembly_checker 1>> $params.log_file 2>&1
"""
}

// Runs the Python module run_release_in_embassy.analyze_vcf_validation_results on
// the release folder params.assembly_folder for the assembly params.assembly.
// stdout and stderr are appended to params.log_file.
// The input `flag` is not referenced by the script; it is received and re-emitted
// (as `true`) only so the workflow can run the steps in order.
process analyze_vcf_validation_results {

label 'long_time', 'med_mem'

input:
val flag

output:
val true, emit: flag

script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.analyze_vcf_validation_results --species-release-folder $params.assembly_folder --assembly-accession $params.assembly 1>> $params.log_file 2>&1
"""
}

// Runs the Python module run_release_in_embassy.count_rs_ids_in_release_files for
// the taxonomy/assembly in params, handing it the counting script path
// params.executable.count_ids_in_vcf and the release folder params.assembly_folder.
// stdout and stderr are appended to params.log_file.
// The input `flag` is not referenced by the script; it is received and re-emitted
// (as `true`) only so the workflow can run the steps in order.
process count_rs_ids_in_release_files {

label 'long_time', 'med_mem'

input:
val flag

output:
val true, emit: flag

script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.count_rs_ids_in_release_files --count-ids-script-path $params.executable.count_ids_in_vcf --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1
"""
}

// Runs the Python module run_release_in_embassy.validate_rs_release_files for the
// taxonomy/assembly/release version in params against the release folder
// params.assembly_folder. stdout and stderr are appended to params.log_file.
// The input `flag` is not referenced by the script; it is received and re-emitted
// (as `true`) only so the workflow can run the steps in order.
process validate_rs_release_files {

label 'long_time', 'med_mem'

input:
val flag

output:
val true, emit: flag

script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.validate_rs_release_files --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --release-version $params.release_version --species-release-folder $params.assembly_folder 1>> $params.log_file 2>&1
"""
}

// Runs the Python module run_release_in_embassy.update_sequence_names_to_ena for
// the taxonomy/assembly in params, handing it the converter path
// params.executable.convert_vcf_file, the bcftools path and the release folder
// params.assembly_folder. stdout and stderr are appended to params.log_file.
// The input `flag` is not referenced by the script; it is received and re-emitted
// (as `true`) only so the workflow can run the steps in order.
process update_sequence_names_to_ena {

label 'long_time', 'med_mem'

input:
val flag

output:
val true, emit: flag

script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.update_sequence_names_to_ena --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --species-release-folder $params.assembly_folder --sequence-name-converter-path $params.executable.convert_vcf_file --bcftools-path $params.executable.bcftools 1>> $params.log_file 2>&1
"""
}

// Last step of the workflow: runs the Python module
// run_release_in_embassy.update_release_status_for_assembly for the taxonomy,
// assembly and release version given in params. stdout and stderr are appended
// to params.log_file.
// The input `flag` is not referenced by the script; it only makes this process
// wait for the previous step. The output is emitted under the name `flag`, the
// same name every other process in this pipeline uses.
process update_release_status_for_assembly {

label 'short_time', 'med_mem'

input:
val flag

output:
val true, emit: flag

script:
"""
export PYTHONPATH=$params.python_path
$params.executable.python.interpreter -m run_release_in_embassy.update_release_status_for_assembly --private-config-xml-file $params.maven.settings_file --profile $params.maven.environment --release-species-inventory-table eva_progress_tracker.clustering_release_tracker --taxonomy-id $params.taxonomy --assembly-accession $params.assembly --release-version $params.release_version 1>> $params.log_file 2>&1
"""
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,17 @@
# limitations under the License.

import click
import logging
import sys
import traceback

from ebi_eva_common_pyutils.logger import logging_config

from run_release_in_embassy.create_release_properties_file import create_release_properties_file_for_assembly
from run_release_in_embassy.release_common_utils import open_mongo_port_to_tempmongo, close_mongo_port_to_tempmongo
from ebi_eva_common_pyutils.command_utils import run_command_with_output


# Module-level logger. The second assignment rebinds the name, so the logger
# returned by logging_config.get_logger is the one the rest of the module uses.
logger = logging.getLogger(__name__)
logger = logging_config.get_logger(__name__)


def run_release_for_assembly(private_config_xml_file, profile, taxonomy_id, assembly_accession,
Expand Down
Loading
Loading