Skip to content

Commit

Permalink
Merge pull request #10 from tcezard/EVA3308_remap_for_all_taxids
Browse files Browse the repository at this point in the history
EVA-3308 Support multiple taxonomy ids when remapping
  • Loading branch information
tcezard authored Jan 15, 2024
2 parents 8eb8012 + c39f74c commit 431dce6
Show file tree
Hide file tree
Showing 6 changed files with 158 additions and 110 deletions.
3 changes: 0 additions & 3 deletions bin/add_target_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,6 @@ def main():

load_config()

if not args.taxonomy or not args.target_assembly or not args.release_version:
raise ArgumentError(None, 'Must provide --taxonomy, --target_assembly, and --release_version')

job = AssemblyIngestionJob(args.taxonomy, args.target_assembly, args.release_version)
logging_config.add_stdout_handler()

Expand Down
204 changes: 118 additions & 86 deletions eva_assembly_ingestion/assembly_ingestion_job.py

Large diffs are not rendered by default.

20 changes: 13 additions & 7 deletions eva_assembly_ingestion/nextflow/remap_cluster.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def helpMessage() {
Remap one assembly version to another, cluster, and QC.
Inputs:
--taxonomy_id taxonomy id of submitted variants that needs to be remapped.
--taxonomy_list list of taxonomy ids of the submitted variants that need to be remapped.
--source_assembly_accession assembly accession the submitted variants are currently mapped to.
--target_assembly_accession assembly accession the submitted variants will be remapped to.
--species_name scientific name to be used for the species.
Expand All @@ -35,8 +35,8 @@ params.help = null
if (params.help) exit 0, helpMessage()

// Test input files
if (!params.taxonomy_id || !params.source_assembly_accession || !params.target_assembly_accession || !params.species_name || !params.genome_assembly_dir ) {
if (!params.taxonomy_id) log.warn('Provide the taxonomy id of the source submitted variants using --taxonomy_id')
if (!params.taxonomy_list || !params.source_assembly_accession || !params.target_assembly_accession || !params.species_name || !params.genome_assembly_dir ) {
if (!params.taxonomy_list) log.warn('Provide the taxonomy id of the source submitted variants using --taxonomy_list')
if (!params.source_assembly_accession) log.warn('Provide the source assembly using --source_assembly_accession')
if (!params.target_assembly_accession) log.warn('Provide the target assembly using --target_assembly_accession')
if (!params.species_name) log.warn('Provide a species name using --species_name')
Expand Down Expand Up @@ -123,7 +123,7 @@ process update_target_genome {


/*
* Extract the submitted variants to remap from the accesioning warehouse and store them in a VCF file.
* Extract the submitted variants to remap from the accessioning warehouse and store them in a VCF file.
*/
process extract_vcf_from_mongo {
memory "${params.memory}GB"
Expand All @@ -132,11 +132,12 @@ process extract_vcf_from_mongo {
input:
path source_fasta
path source_report
each taxonomy

output:
// Store both vcfs (eva and dbsnp), emit: one channel
path '*.vcf', emit: source_vcfs
path "${params.source_assembly_accession}_vcf_extractor.log", emit: log_filename
path "${params.source_assembly_accession}_${taxonomy}_vcf_extractor.log", emit: log_filename

publishDir "$params.output_dir/logs", overwrite: true, mode: "copy", pattern: "*.log*"

Expand All @@ -145,7 +146,8 @@ process extract_vcf_from_mongo {
--spring.config.location=file:${params.extraction_properties} \
--parameters.fasta=${source_fasta} \
--parameters.assemblyReportUrl=file:${source_report} \
> ${params.source_assembly_accession}_vcf_extractor.log
--parameters.taxonomy=${taxonomy} \
> ${params.source_assembly_accession}_${taxonomy}_vcf_extractor.log
"""
}

Expand Down Expand Up @@ -330,7 +332,11 @@ workflow {
update_source_genome(params.source_assembly_accession, retrieve_source_genome.out.source_fasta,
retrieve_source_genome.out.source_report, params.remapping_config)
update_target_genome(retrieve_target_genome.out.target_fasta, retrieve_target_genome.out.target_report, params.remapping_config)
extract_vcf_from_mongo(update_source_genome.out.updated_source_fasta, update_source_genome.out.updated_source_report)
extract_vcf_from_mongo(
update_source_genome.out.updated_source_fasta,
update_source_genome.out.updated_source_report,
params.taxonomy_list
)
remap_variants(extract_vcf_from_mongo.out.source_vcfs.flatten(), update_source_genome.out.updated_source_fasta,
update_target_genome.out.updated_target_fasta)
ingest_vcf_into_mongo(remap_variants.out.remapped_vcfs, update_target_genome.out.updated_target_report)
Expand Down
15 changes: 11 additions & 4 deletions tests/nextflow-tests/java/FakeExtractionPipeline.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,21 @@ public class FakeExtractionPipeline {

public static void main(String[] args) {
String outString = "java -jar extraction.jar";
String inFile = null;
String accession = null;
String taxonomy = null;
for (String arg: args) {
outString += " " + arg;
if (arg.startsWith("--parameters.fasta="))
inFile = arg.substring("--parameters.fasta=".length(), arg.length()-"_custom.fa".length());
if (arg.startsWith("--parameters.fasta=")){
accession = arg.substring("--parameters.fasta=".length(), arg.length()-"_custom.fa".length());
}
if (arg.startsWith("--parameters.taxonomy=")){
taxonomy = arg.substring("--parameters.taxonomy=".length(), arg.length());
}
}
System.out.println(outString);
System.out.println(inFile);
System.out.println(accession);
System.out.println(taxonomy);
String inFile = accession + "_" + taxonomy;

// real pipeline gets this from properties
String outFile1 = inFile + "_dbsnp.vcf";
Expand Down
25 changes: 15 additions & 10 deletions tests/nextflow-tests/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ PATH=${SCRIPT_DIR}/bin:$PATH

printf "\e[32m===== REMAPPING AND CLUSTERING PIPELINE =====\e[0m\n"
nextflow run ${SOURCE_DIR}/eva_assembly_ingestion/nextflow/remap_cluster.nf -params-file test_config.yaml \
--taxonomy_id 1234 \
--source_assembly_accession GCA_0000001 \
--target_assembly_accession GCA_0000002 \
--species_name "Thingy thungus" \
Expand All @@ -27,15 +26,21 @@ nextflow run ${SOURCE_DIR}/eva_assembly_ingestion/nextflow/remap_cluster.nf -par
--remapping_required 1 \
--memory 2

ls ${SCRIPT_DIR}/output/dbsnp/GCA_0000001_dbsnp_remapped.vcf \
${SCRIPT_DIR}/output/dbsnp/GCA_0000001_dbsnp_remapped_unmapped.vcf \
${SCRIPT_DIR}/output/dbsnp/GCA_0000001_dbsnp_remapped_counts.yml \
${SCRIPT_DIR}/output/eva/GCA_0000001_eva_remapped.vcf \
${SCRIPT_DIR}/output/eva/GCA_0000001_eva_remapped_unmapped.vcf \
${SCRIPT_DIR}/output/eva/GCA_0000001_eva_remapped_counts.yml

# Test we have 7 log files in the logs directory (1 extraction, 2 ingestion, 3 clustering, 1 backpropagate)
[[ $(find ${SCRIPT_DIR}/output/logs/ -type f -name "*.log" | wc -l) -eq 7 ]]
ls ${SCRIPT_DIR}/output/dbsnp/GCA_0000001_1233_dbsnp_remapped.vcf \
${SCRIPT_DIR}/output/dbsnp/GCA_0000001_1233_dbsnp_remapped_unmapped.vcf \
${SCRIPT_DIR}/output/dbsnp/GCA_0000001_1233_dbsnp_remapped_counts.yml \
${SCRIPT_DIR}/output/eva/GCA_0000001_1233_eva_remapped.vcf \
${SCRIPT_DIR}/output/eva/GCA_0000001_1233_eva_remapped_unmapped.vcf \
${SCRIPT_DIR}/output/eva/GCA_0000001_1233_eva_remapped_counts.yml \
${SCRIPT_DIR}/output/dbsnp/GCA_0000001_1234_dbsnp_remapped.vcf \
${SCRIPT_DIR}/output/dbsnp/GCA_0000001_1234_dbsnp_remapped_unmapped.vcf \
${SCRIPT_DIR}/output/dbsnp/GCA_0000001_1234_dbsnp_remapped_counts.yml \
${SCRIPT_DIR}/output/eva/GCA_0000001_1234_eva_remapped.vcf \
${SCRIPT_DIR}/output/eva/GCA_0000001_1234_eva_remapped_unmapped.vcf \
${SCRIPT_DIR}/output/eva/GCA_0000001_1234_eva_remapped_counts.yml

# Test we have 10 log files in the logs directory (2 extraction, 4 ingestion, 3 clustering, 1 backpropagate)
[[ $(find ${SCRIPT_DIR}/output/logs/ -type f -name "*.log" | wc -l) -eq 10 ]]

# Test we have 1 rs_report in the logs directory
[[ $(find ${SCRIPT_DIR}/output/logs/ -type f -name "*.txt" | wc -l) -eq 1 ]]
Expand Down
1 change: 1 addition & 0 deletions tests/nextflow-tests/test_config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
taxonomy_list: [1233, 1234]

executable:
genome_downloader: ../../../bin/fake_genome_downloader.py
Expand Down

0 comments on commit 431dce6

Please sign in to comment.