Skip to content

Commit

Permalink
Merge pull request #1298 from nextstrain/feat/gene-cds
Browse files Browse the repository at this point in the history
  • Loading branch information
ivan-aksamentov authored Oct 31, 2023
2 parents 08b3fa2 + aea4cde commit f603fc1
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 47 deletions.
21 changes: 11 additions & 10 deletions packages_rs/nextclade-cli/src/cli/nextclade_cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ pub struct NextcladeRunInputArgs {
/// not be translated, amino acid sequences will not be output, amino acid mutations will not be detected and nucleotide sequence
/// alignment will not be informed by codon boundaries
///
/// List of genes can be restricted using `--genes` flag. Otherwise all genes found in the genome annotation will be used.
/// List of genes can be restricted using `--genes` flag. Otherwise, all genes found in the genome annotation will be used.
///
/// Overrides genome annotation provided by the dataset (`--input-dataset` or `--dataset-name`).
///
Expand All @@ -362,11 +362,12 @@ pub struct NextcladeRunInputArgs {
#[clap(value_hint = ValueHint::FilePath)]
pub input_annotation: Option<PathBuf>,

/// Comma-separated list of names of genes to use.
/// Comma-separated list of names of coding sequences (CDSes) to use.
///
/// This defines which peptides will be written into outputs, and which genes will be taken into account during
/// codon-aware alignment and aminoacid mutations detection. Must only contain gene names present in the genome annotation. If
/// this flag is not supplied or its value is an empty string, then all genes found in the genome annotation will be used.
/// codon-aware alignment and amino acid mutation detection. Must only contain CDS names present in the genome annotation.
///
/// If this flag is not supplied or its value is an empty string, then all CDSes found in the genome annotation will be used.
///
/// Requires `--input-annotation` to be specified.
#[clap(
Expand All @@ -376,7 +377,7 @@ pub struct NextcladeRunInputArgs {
use_value_delimiter = true
)]
#[clap(value_hint = ValueHint::FilePath)]
pub genes: Option<Vec<String>>,
pub cds_selection: Option<Vec<String>>,

/// Use custom dataset server
#[clap(long)]
Expand Down Expand Up @@ -493,7 +494,7 @@ pub struct NextcladeRunOutputArgs {
///
/// Example for bash shell:
///
/// --output-translations='output_dir/gene_{gene}.translation.fasta'
/// --output-translations='output_dir/cds_{cds}.translation.fasta'
#[clap(long, short = 'P')]
#[clap(value_hint = ValueHint::AnyPath)]
pub output_translations: Option<String>,
Expand Down Expand Up @@ -815,7 +816,7 @@ pub fn nextclade_get_output_filenames(run_args: &mut NextcladeRunArgs) -> Result

if output_selection.contains(&NextcladeOutputSelection::Translations) {
let output_translations_path =
default_output_file_path.with_file_name(format!("{output_basename}_gene_{{gene}}"));
default_output_file_path.with_file_name(format!("{output_basename}.cds_translation.{{cds}}.fasta"));
let output_translations_path = add_extension(output_translations_path, "translation.fasta");

let output_translations_template = output_translations_path
Expand Down Expand Up @@ -852,17 +853,17 @@ pub fn nextclade_get_output_filenames(run_args: &mut NextcladeRunArgs) -> Result
}

if let Some(output_translations) = output_translations {
if !output_translations.contains("{gene}") {
if !output_translations.contains("{cds}") {
return make_error!(
r#"
Expected `--output-translations` argument to contain a template string containing template variable {{gene}} (with curly braces), but received:
Expected `--output-translations` argument to contain a template string containing template variable {{cds}} (with curly braces), but received:
{output_translations}
Make sure the variable is not substituted by your shell, programming language or workflow manager. Apply proper escaping as needed.
Example for bash shell:
--output-translations='output_dir/gene_{{gene}}.translation.fasta'
--output-translations='output_dir/cds_{{cds}}.translation.fasta'
"#
);
Expand Down
4 changes: 2 additions & 2 deletions packages_rs/nextclade-cli/src/cli/nextclade_loop.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> {

let NextcladeRunArgs {
inputs: NextcladeRunInputArgs {
input_fastas, genes, ..
input_fastas, cds_selection: cdses, ..
},
outputs:
NextcladeRunOutputArgs {
Expand All @@ -40,7 +40,7 @@ pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> {
other_params: NextcladeRunOtherParams { jobs },
} = run_args.clone();

let inputs = nextclade_get_inputs(&run_args, &genes)?;
let inputs = nextclade_get_inputs(&run_args, &cdses)?;
let nextclade = Nextclade::new(inputs, &params)?;

let should_write_tree = output_tree.is_some() || output_tree_nwk.is_some() || output_graph.is_some();
Expand Down
26 changes: 13 additions & 13 deletions packages_rs/nextclade-cli/src/dataset/dataset_download.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,17 @@ const PATHOGEN_JSON: &str = "pathogen.json";

pub fn nextclade_get_inputs(
run_args: &NextcladeRunArgs,
genes: &Option<Vec<String>>,
cdses: &Option<Vec<String>>,
) -> Result<NextcladeParams, Report> {
if let Some(dataset_name) = run_args.inputs.dataset_name.as_ref() {
dataset_str_download_and_load(run_args, genes)
dataset_str_download_and_load(run_args, cdses)
.wrap_err_with(|| format!("When downloading dataset '{dataset_name}'"))
} else if let Some(input_dataset) = run_args.inputs.input_dataset.as_ref() {
if input_dataset.is_file() && has_extension(input_dataset, "zip") {
dataset_zip_load(run_args, input_dataset, genes)
dataset_zip_load(run_args, input_dataset, cdses)
.wrap_err_with(|| format!("When loading dataset from {input_dataset:#?}"))
} else if input_dataset.is_dir() {
dataset_dir_load(run_args, input_dataset, genes)
dataset_dir_load(run_args, input_dataset, cdses)
.wrap_err_with(|| format!("When loading dataset from {input_dataset:#?}"))
} else {
make_error!(
Expand All @@ -45,7 +45,7 @@ pub fn nextclade_get_inputs(
)
}
} else {
dataset_individual_files_load(run_args, genes)
dataset_individual_files_load(run_args, cdses)
}
}

Expand Down Expand Up @@ -93,7 +93,7 @@ pub fn read_from_path_or_zip(
pub fn dataset_zip_load(
run_args: &NextcladeRunArgs,
dataset_zip: impl AsRef<Path>,
genes: &Option<Vec<String>>,
cdses: &Option<Vec<String>>,
) -> Result<NextcladeParams, Report> {
let file = File::open(dataset_zip)?;
let buf_file = BufReader::new(file);
Expand All @@ -112,7 +112,7 @@ pub fn dataset_zip_load(
let gene_map = read_from_path_or_zip(&run_args.inputs.input_annotation, &mut zip, "genome_annotation.gff3")?
.map_ref_fallible(GeneMap::from_str)
.wrap_err("When reading genome annotation from dataset")?
.map(|gene_map| filter_gene_map(gene_map, genes))
.map(|gene_map| filter_gene_map(gene_map, cdses))
.unwrap_or_default();

let tree = read_from_path_or_zip(&run_args.inputs.input_tree, &mut zip, "tree.json")?
Expand Down Expand Up @@ -142,7 +142,7 @@ pub fn dataset_dir_download(http: &mut HttpClient, dataset: &Dataset, output_dir
pub fn dataset_dir_load(
run_args: &NextcladeRunArgs,
dataset_dir: impl AsRef<Path>,
genes: &Option<Vec<String>>,
cdses: &Option<Vec<String>>,
) -> Result<NextcladeParams, Report> {
let dataset_dir = dataset_dir.as_ref();

Expand Down Expand Up @@ -176,7 +176,7 @@ pub fn dataset_dir_load(
})
.map_ref_fallible(GeneMap::from_path)
.wrap_err("When reading genome annotation")?
.map(|gen_map| filter_gene_map(gen_map, genes))
.map(|gen_map| filter_gene_map(gen_map, cdses))
.unwrap_or_default();

let tree = input_tree
Expand All @@ -201,7 +201,7 @@ pub fn dataset_dir_load(

pub fn dataset_individual_files_load(
run_args: &NextcladeRunArgs,
genes: &Option<Vec<String>>,
cdses: &Option<Vec<String>>,
) -> Result<NextcladeParams, Report> {
match (&run_args.inputs.input_dataset, &run_args.inputs.input_ref) {
(None, None) => make_error!("When `--input-dataset` is not specified, --input-ref is required"),
Expand Down Expand Up @@ -273,7 +273,7 @@ pub fn dataset_individual_files_load(
.as_ref()
.map_ref_fallible(GeneMap::from_path)
.wrap_err("When reading genome annotation")?
.map(|gen_map| filter_gene_map(gen_map, genes))
.map(|gen_map| filter_gene_map(gen_map, cdses))
.unwrap_or_default();

let tree = run_args
Expand Down Expand Up @@ -317,7 +317,7 @@ pub fn read_from_path_or_url(

pub fn dataset_str_download_and_load(
run_args: &NextcladeRunArgs,
genes: &Option<Vec<String>>,
cdses: &Option<Vec<String>>,
) -> Result<NextcladeParams, Report> {
let verbose = log::max_level() > LevelFilter::Info;
let mut http = HttpClient::new(&run_args.inputs.server, &ProxyConfig::default(), verbose)?;
Expand Down Expand Up @@ -357,7 +357,7 @@ pub fn dataset_str_download_and_load(
)?
.map_ref_fallible(GeneMap::from_str)
.wrap_err("When reading genome annotation from dataset")?
.map(|gene_map| filter_gene_map(gene_map, genes))
.map(|gene_map| filter_gene_map(gene_map, cdses))
.unwrap_or_default();

let tree = read_from_path_or_url(
Expand Down
4 changes: 2 additions & 2 deletions packages_rs/nextclade-web/src/hooks/useExportResults.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ export const DEFAULT_EXPORT_PARAMS: ExportParams = {
filenameTreeNwk: 'nextclade.nwk',
filenameFasta: 'nextclade.aligned.fasta',
filenamePeptidesZip: 'nextclade.peptides.fasta.zip',
filenamePeptidesTemplate: 'nextclade.peptide.{{GENE}}.fasta',
filenamePeptidesTemplate: 'nextclade.cds_translation.{{cds}}.fasta',
}

function useResultsExport(exportFn: (filename: string, snapshot: Snapshot, worker: ExportWorker) => Promise<void>) {
Expand Down Expand Up @@ -215,7 +215,7 @@ async function preparePeptideFiles(snapshot: Snapshot) {
file.data = `${file.data}${fastaEntry}`
} else {
let filename = DEFAULT_EXPORT_PARAMS.filenamePeptidesTemplate
filename = filename.replace('{{GENE}}', name)
filename = filename.replace('{{cds}}', name)
filesMap.set(name, { filename, data: fastaEntry })
}
}
Expand Down
23 changes: 11 additions & 12 deletions packages_rs/nextclade/src/gene/gene_map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,31 +160,30 @@ impl GeneMap {
}

/// Filters genome annotation according to the list of requested genes.
pub fn filter_gene_map(gene_map: GeneMap, genes: &Option<Vec<String>>) -> GeneMap {
if let Some(genes) = genes {
pub fn filter_gene_map(gene_map: GeneMap, cdses: &Option<Vec<String>>) -> GeneMap {
if let Some(cdses) = cdses {
let gene_map: BTreeMap<String, Gene> = gene_map
.into_iter_genes()
.filter(|(gene_name, ..)| genes.contains(gene_name))
.filter(|(gene_name, ..)| cdses.contains(gene_name))
.collect();

let requested_genes_not_in_genemap = get_requested_genes_not_in_genemap(&gene_map, genes);
if !requested_genes_not_in_genemap.is_empty() {
let requested_but_not_found = get_requested_cdses_not_in_genemap(&gene_map, cdses);
if !requested_but_not_found.is_empty() {
warn!(
"The following genes were requested through `--genes` \
but not found in the genome annotation: \
`{requested_genes_not_in_genemap}`",
"The following genes were requested through `--cdses` but not found in the genome annotation: {requested_but_not_found}",
);
}
return GeneMap::from_genes(gene_map);
}
gene_map
}

fn get_requested_genes_not_in_genemap(gene_map: &BTreeMap<String, Gene>, genes: &[String]) -> String {
genes
fn get_requested_cdses_not_in_genemap(gene_map: &BTreeMap<String, Gene>, cdses: &[String]) -> String {
cdses
.iter()
.filter(|&gene_name| !gene_map.contains_key(gene_name))
.join("`, `")
.filter(|&cds_name| !gene_map.contains_key(cds_name))
.map(|name| format!("'{name}'"))
.join(", ")
}

pub fn convert_feature_tree_to_gene_map(feature_tree: &FeatureTree) -> Result<GeneMap, Report> {
Expand Down
4 changes: 2 additions & 2 deletions packages_rs/nextclade/src/io/fasta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ impl FastaWriter {

#[derive(Clone, Debug, Serialize)]
struct OutputTranslationsTemplateContext<'a> {
gene: &'a str,
cds: &'a str,
}

pub type FastaPeptideWritersMap = BTreeMap<String, FastaWriter>;
Expand All @@ -218,7 +218,7 @@ impl FastaPeptideWriter {
let writers = gene_map
.iter_cdses()
.map(|cds| -> Result<_, Report> {
let template_context = OutputTranslationsTemplateContext { gene: &cds.name };
let template_context = OutputTranslationsTemplateContext { cds: &cds.name };
let rendered_path = tt
.render("output_translations", &template_context)
.wrap_err_with(|| format!("When rendering output translations path template: '{output_translations}', using context: {template_context:?}"))?;
Expand Down
12 changes: 6 additions & 6 deletions tests/run-smoke-tests
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ function run_with_dataset_dir() {

${NEXTCLADE_BIN} run --quiet --in-order --include-reference \
--input-dataset="${dataset_dir}" \
--output-translations="${out_dir}/translations/gene_{gene}.translation.fasta" \
--output-translations="${out_dir}/translations/{cds}.translation.fasta" \
--output-all="${out_dir}" \
"${sequences}"
}
Expand All @@ -56,7 +56,7 @@ function run_with_dataset_zip() {

${NEXTCLADE_BIN} run --quiet --in-order --include-reference \
--input-dataset="${dataset_dir}/dataset.zip" \
--output-translations="${out_dir}/translations/gene_{gene}.translation.fasta" \
--output-translations="${out_dir}/translations/{cds}.translation.fasta" \
--output-all="${out_dir}" \
"${sequences}"
}
Expand All @@ -71,7 +71,7 @@ function run_with_ref_only() {

${NEXTCLADE_BIN} run --quiet --in-order --include-reference \
--input-ref="${dataset_dir}/reference.fasta" \
--output-translations="${out_dir}/translations/gene_{gene}.translation.fasta" \
--output-translations="${out_dir}/translations/{cds}.translation.fasta" \
--output-all="${out_dir}" \
"${sequences}"
}
Expand All @@ -89,7 +89,7 @@ function run_with_ref_and_annotation() {
${NEXTCLADE_BIN} run --quiet --in-order --include-reference \
--input-ref="${dataset_dir}/reference.fasta" \
--input-annotation="${dataset_dir}/genome_annotation.gff3" \
--output-translations="${out_dir}/translations/gene_{gene}.translation.fasta" \
--output-translations="${out_dir}/translations/{cds}.translation.fasta" \
--output-all="${out_dir}" \
"${sequences}"
}
Expand All @@ -107,7 +107,7 @@ function run_with_ref_and_tree() {
${NEXTCLADE_BIN} run --quiet --in-order --include-reference \
--input-ref="${dataset_dir}/reference.fasta" \
--input-tree="${dataset_dir}/tree.json" \
--output-translations="${out_dir}/translations/gene_{gene}.translation.fasta" \
--output-translations="${out_dir}/translations/{cds}.translation.fasta" \
--output-all="${out_dir}" \
"${sequences}"
}
Expand All @@ -127,7 +127,7 @@ function run_with_ref_and_annotation_and_tree() {
--input-ref="${dataset_dir}/reference.fasta" \
--input-annotation="${dataset_dir}/genome_annotation.gff3" \
--input-tree="${dataset_dir}/tree.json" \
--output-translations="${out_dir}/translations/gene_{gene}.translation.fasta" \
--output-translations="${out_dir}/translations/{cds}.translation.fasta" \
--output-all="${out_dir}" \
"${sequences}"
}
Expand Down

1 comment on commit f603fc1

@vercel
Copy link

@vercel vercel bot commented on f603fc1 Oct 31, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

nextclade – ./

nextclade-git-master-nextstrain.vercel.app
nextclade.vercel.app
nextclade-nextstrain.vercel.app

Please sign in to comment.