Skip to content

Commit

Permalink
minor updates, upgrade to 0.3.0-alpha (#38)
Browse files Browse the repository at this point in the history
  • Loading branch information
richard-burhans authored Nov 14, 2024
1 parent 5587b06 commit 544ef29
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 64 deletions.
23 changes: 19 additions & 4 deletions tools/ncbi_egapx/macros.xml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
<macros>
<xml name="requirements">
<requirements>
<container type="docker">quay.io/richard-burhans/egapx:@TOOL_VERSION@</container>
<container type="docker">quay.io/galaxy/egapx:@TOOL_VERSION@</container>
</requirements>
</xml>
<token name="@TOOL_VERSION@">0.2-alpha</token>
<token name="@VERSION_SUFFIX@">4</token>
<token name="@TOOL_VERSION@">0.3.0-alpha</token>
<token name="@VERSION_SUFFIX@">0</token>
<token name="@PROFILE@">22.05</token>
<xml name="edam_ontology">
<edam_operations>
Expand All @@ -14,7 +14,22 @@
</xml>
<xml name="citations">
<citations>
<citation type="doi">10.1093/bioinformatics/bts573</citation>
<citation type="bibtex">
@misc{githubegapx,
author = {NCBI},
year = "2024",
title = {Eukaryotic Genome Annotation Pipeline - External (EGAPx)},
publisher = {GitHub},
journal = {GitHub repository},
url = {https://github.com/ncbi/egapx}
}
</citation>
</citations>
</xml>
<xml name="creators">
<creator>
<person givenName="Ross" familyName="Lazarus" url="https://orcid.org/0000-0003-3939-1961"/>
<person givenName="Richard" familyName="Burhans" url="https://science.psu.edu/bmb/people/rcb112"/>
</creator>
</xml>
</macros>
117 changes: 60 additions & 57 deletions tools/ncbi_egapx/ncbi_egapx.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,49 +6,51 @@
<expand macro="edam_ontology"/>
<expand macro="requirements"/>
<command detect_errors="aggressive"><![CDATA[
#if str($cond_input_style.input_style) == "fillform":
#set yamlconfig = "egapx.yaml"
echo '# yaml generated by ncbi_egapx.xml' > '$yamlconfig' &&
#if str($reference_genome.genome_type_select) == "history"
echo 'genome: $reference_genome.genome' >> '$yamlconfig' &&
#elif str($reference_genome.genome_type_select) == "indexed":
echo 'genome: $reference_genome.genome.fields.path' >> '$yamlconfig' &&
#else:
echo 'genome: $reference_genome.uri' >> '$yamlconfig' &&
#end if
echo 'taxid: $taxid' >> '$yamlconfig' &&
echo 'reads:' >> '$yamlconfig' &&
#if str($condrnaseq.rna_type_select) == "list":
#set rs = $rnaseq.split()
#set rsplit = [x.strip() for x in $rs]
#for $r in $rsplit:
echo ' - $r' >> '$yamlconfig' &&
#end for
#else:
#for $r in $rnaseq:
echo ' - $r' >> '$yamlconfig' &&
#end for
#end if
#if $proteins:
echo 'proteins: $proteins' >> '$yamlconfig' &&
#end if
#if len($xtra.strip()) > 0:
#set lxtra = $xtra.split("\n")
#for row in $lxtra:
echo '$row' >> '$yamlconfig' &&
#end for
#end if
echo '' >> '$yamlconfig' &&
echo "Calculated contents of egapx yaml" &&
cat '$yamlconfig' &&
#else:
#if str($cond_input_style.input_style) == "fillform"
#set yamlconfig = $egapx_config
#else
#set yamlconfig = $yamlin
#end if
## activate the following
## - nextflow conda environment
## - EGAPx python virtual environment
source /galaxy/env.bash &&
echo \${PATH} &&
## use the augmented container EGAPx config
ln -s /galaxy/egapx/egapx_config &&
## run EGAPx
python3 /galaxy/egapx/ui/egapx.py '$yamlconfig' -e galaxy -o 'egapx_out'
]]></command>
<configfiles>
<configfile name="egapx_config"><![CDATA[
## Renders the EGAPx YAML configuration from the tool form values.
## Nothing is emitted unless the "fillform" input style is selected.
#if str($cond_input_style.input_style) == "fillform"
# yaml generated by ncbi_egapx.xml
## Pick the genome location according to the selected reference source.
#if str($reference_genome.genome_type_select) == "history"
#set genome_value = $reference_genome.genome
#elif str($reference_genome.genome_type_select) == "indexed"
#set genome_value = $reference_genome.genome.fields.path
#else
#set genome_value = $reference_genome.uri
#end if
genome: $genome_value
taxid: $taxid
## "list" input is free text split on whitespace; otherwise $rnaseq is iterated as-is.
#if str($condrnaseq.rna_type_select) == "list"
#set $reads_values = $rnaseq.split()
#else
#set $reads_values = $rnaseq
#end if
reads:
## str() is applied before strip() so that non-string entries (history
## datasets) are rendered the same way as plain identifiers and URIs.
#for r in [str(x).strip() for x in $reads_values]
- $r
#end for
## The protein set is optional; an unset data parameter renders as "None".
#if str($proteins) != "None"
proteins: $proteins
#end if
## Append any extra YAML supplied by the user, one line at a time.
#for row in $xtra.strip().split("\n")
$row
#end for
#end if
]]></configfile>
</configfiles>
<inputs>
<conditional name="cond_input_style">
<param name="input_style" type="select" label="Fill in a tool form or use an existing yaml configuration from the current history?"
Expand All @@ -59,13 +61,13 @@
<when value="fillform">
<conditional name="reference_genome">
<param name="genome_type_select" type="select" label="Reference genome source for mapping supplied RNA-seq reads"
help="Select a built in, history or remote URI for the reference genome fasta">
<option value="history" selected="True">Use a genome fasta file from the current history</option>
help="Select a built in, history or remote URI for the reference genome FASTA">
<option value="history" selected="True">Use a genome FASTA file from the current history</option>
<option value="indexed">Use a Galaxy server built-in genome</option>
<option value="uri">Provide a remote web link URI ("https://...") pointing at the required genome reference fasta file</option>
<option value="uri">Provide a remote web link URI ("https://...") pointing at the required genome reference FASTA file</option>
</param>
<when value="history">
<param name="genome" type="data" format="fasta" label="Select the reference genome fasta from the current history"/>
<param name="genome" type="data" format="fasta" label="Select the reference genome FASTA from the current history"/>
</when>
<when value="indexed">
<param name="genome" type="select" label="Select a built in reference genome or custom genome"
Expand All @@ -76,19 +78,19 @@
</param>
</when>
<when value="uri">
<param name="uri" type="text" label="URI pointing to the reference genome fasta file"/>
<param name="uri" type="text" label="URI pointing to the reference genome FASTA file"/>
</when>
</conditional>
<param name="taxid" type="text" label="NCBI Taxon ID" help="Used to identify the HMM model files needed"/>
<param name="taxid" type="integer" min="0" label="NCBI Taxon ID" help="Used to identify the HMM model files needed"/>
<conditional name="condrnaseq">
<param name="rna_type_select" type="select" label="RNA sequence data source"
help="Select RNAseq input data from history or input a list of SRA identifiers or remote URI">
<option value="list" selected="True">Type in a list of SRA identifiers and/or remote RNA-seq fasta URI</option>
<option value="list" selected="True">Type in a list of SRA identifiers and/or remote RNA-seq FASTA URI</option>
<option value="history">Select one or more RNA-seq fastq datasets from the current history</option>
</param>
<when value="list">
<param name="rnaseq" type="text" area="true" label="List all required individual RNA-seq URI or SRA identifiers, separated by spaces or newlines"
help="Either a working URI for a RNA-seq fasta, or a bare SRA identifier will work - can be mixed">
help="Either a working URI for a RNA-seq FASTA, or a bare SRA identifier will work - can be mixed">
<validator type="empty_field"/>
</param>
</when>
Expand All @@ -97,7 +99,7 @@
help="All selected rna-seq fastqsanger will be added to the yaml for egapx configuration"/>
</when>
</conditional>
<param name="proteins" type="data" format="fasta,tasta.gz" optional="true" label="Select a protein set"/>
<param name="proteins" type="data" format="fasta,fasta.gz" optional="true" label="Select a protein set"/>
<param name="xtra" type="text" area="true" label="Additional yaml to append to the egapx.yaml configuration"
help="Not normally needed but useful for testing additional configuration elements">
<sanitizer invalid_char="">
Expand All @@ -106,7 +108,7 @@
</param>
</when>
<when value="history">
<param name="yamlin" type="data" format="yaml,txt" label="egapx configuration yaml file to pass to Nextflow"/>
<param name="yamlin" type="data" format="yaml" label="egapx configuration yaml file to pass to Nextflow"/>
</when>
</conditional>
</inputs>
Expand All @@ -122,8 +124,13 @@
</outputs>
<tests>
<test expect_test_failure="true">
<param name="input_style" value="history"/>
<param name="yamlin" value="input.yaml"/>
<param name="input_style" value="fillform"/>
<param name="taxid" value="6954"/>
<param name="genome_type_select" value="uri"/>
<param name="uri" value="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz"/>
<param name="rna_type_select" value="list"/>
<param name="rnaseq" value="https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2"/>
<param name="xtra" value="hmm: https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/gnomon/hmm_parameters/6956.params&#10;tasks:&#10; star_wnode:&#10; star_wnode: -cpus-per-worker 4"/>
<output name="output"><assert_contents><has_size min="1"/></assert_contents></output>
<output_collection name="nextflow_stats" type="list">
<element name="nf_log"><assert_contents><has_size min="1"/></assert_contents></element>
Expand All @@ -134,13 +141,8 @@
</output_collection>
</test>
<test expect_test_failure="true">
<param name="input_style" value="fillform"/>
<param name="taxid" value="6954"/>
<param name="genome_type_select" value="uri"/>
<param name="uri" value="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz"/>
<param name="rna_type_select" value="list"/>
<param name="rnaseq" value="https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2"/>
<param name="xtra" value="proteins: []&#10;hmm: https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/gnomon/hmm_parameters/6956.params&#10;tasks:&#10; star_wnode:&#10; star_wnode: -cpus-per-worker 4"/>
<param name="input_style" value="history"/>
<param name="yamlin" value="input.yaml"/>
<output name="output"><assert_contents><has_size min="1"/></assert_contents></output>
<output_collection name="nextflow_stats" type="list">
<element name="nf_log"><assert_contents><has_size min="1"/></assert_contents></element>
Expand Down Expand Up @@ -242,7 +244,7 @@ The current version is an alpha release with limited features and organism scope
EGAPx is the publicly accessible version of the updated NCBI [Eukaryotic Genome Annotation Pipeline](https://www.ncbi.nlm.nih.gov/genome/annotation_euk/process/).
EGAPx takes an assembly fasta file, a taxid of the organism, and RNA-seq data. Based on the taxid, EGAPx will pick protein sets and HMM models. The pipeline runs `miniprot` to align protein sequences, and `STAR` to align RNA-seq to the assembly. Protein alignments and RNA-seq read alignments are then passed to `Gnomon` for gene prediction. In the first step of `Gnomon`, the short alignments are chained together into putative gene models.
EGAPx takes an assembly FASTA file, a taxid of the organism, and RNA-seq data. Based on the taxid, EGAPx will pick protein sets and HMM models. The pipeline runs `miniprot` to align protein sequences, and `STAR` to align RNA-seq to the assembly. Protein alignments and RNA-seq read alignments are then passed to `Gnomon` for gene prediction. In the first step of `Gnomon`, the short alignments are chained together into putative gene models.
In the second step, these predictions are further supplemented by *ab-initio* predictions based on HMM models. The final annotation for the input assembly is produced as a `gff` file.
**Security Notice:**
Expand Down Expand Up @@ -310,4 +312,5 @@ In the log, each line denotes the process that completed in the workflow. The fi
2024-03-27 11:20:24 17127134 aligns.paf
]]></help>
<expand macro="citations"/>
<expand macro="creators"/>
</tool>
4 changes: 1 addition & 3 deletions tools/ncbi_egapx/test-data/input.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@
# To limit the requirements you also need to use -e docker_minimal

genome: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/020/809/275/GCF_020809275.1_ASM2080927v1/GCF_020809275.1_ASM2080927v1_genomic.fna.gz
taxid: 6954
reads:
- https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.1
- https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR8506572.2
- https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.1
- https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/sample_data/Dermatophagoides_farinae_small/SRR9005248.2
taxid: 6954
proteins: []
hmm: https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/EGAP/gnomon/hmm_parameters/6956.params
tasks:
star_wnode:
star_wnode: -cpus-per-worker 4

0 comments on commit 544ef29

Please sign in to comment.