From 732ac4d861e5e0c7396174827a87df2539624277 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 30 Jul 2024 01:10:35 +0200 Subject: [PATCH] output separate files --- .../config.vsh.yaml | 347 +++++++++++++++++- .../bd_rhapsody_sequence_analysis/script.py | 140 +++---- .../bd_rhapsody_sequence_analysis/test.sh | 34 +- 3 files changed, 431 insertions(+), 90 deletions(-) diff --git a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/config.vsh.yaml b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/config.vsh.yaml index 91146286..fd17476a 100644 --- a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/config.vsh.yaml +++ b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/config.vsh.yaml @@ -116,20 +116,317 @@ argument_groups: info: config_key: Supplemental_Reference - name: Outputs + description: Outputs for all pipeline runs + # based on https://bd-rhapsody-bioinfo-docs.genomics.bd.com/outputs/top_outputs.html arguments: - # TODO: split up into different outputs - # - ${run_name}_Bioproduct_Stats.csv - # - ${run_name}_Metrics_Summary.csv - # - ${run_name}_RSEC_MolsPerCell_Unfiltered_MEX.zip - # - ${run_name}_Logs/ - # TODO: determine additional outputs for ABC, ATAC, VDJ, SMK - - name: "--output" + - name: "--output_dir" type: file direction: output alternatives: [-o] - description: "Output folder. Output still needs to be processed further." + description: "The unprocessed output directory containing all the outputs from the pipeline." required: true example: output_dir/ + - name: "--output_seurat" + type: file + direction: output + description: "Single-cell analysis tool inputs. Seurat (.rds) input file containing RSEC molecules data table and all cell annotation metadata." + example: output_seurat.rds + required: false + info: + template: "[sample_name]_Seurat.rds" + - name: "--output_mudata" + type: file + direction: output + description: "Single-cell analysis tool inputs. Scanpy / Muon input file containing RSEC molecules data table and all cell annotation metadata." + example: output_mudata.h5mu + required: false + info: + template: "[sample_name].h5mu" + - name: "--metrics_summary" + type: file + direction: output + description: "Metrics Summary. Report containing sequencing, molecules, and cell metrics." + example: metrics_summary.csv + required: false + info: + template: "[sample_name]_Metrics_Summary.csv" + - name: "--pipeline_report" + type: file + direction: output + description: "Pipeline Report. Summary report containing the results from the sequencing analysis pipeline run." + example: pipeline_report.html + required: false + info: + template: "[sample_name]_Pipeline_Report.html" + - name: "--rsec_mols_per_cell" + type: file + direction: output + description: "Molecules per bioproduct per cell bassed on RSEC" + example: RSEC_MolsPerCell_MEX.zip + required: false + info: + template: "[sample_name]_RSEC_MolsPerCell_MEX.zip" + - name: "--dbec_mols_per_cell" + type: file + direction: output + description: "Molecules per bioproduct per cell bassed on DBEC. DBEC data table is only output if the experiment includes targeted mRNA or AbSeq bioproducts." + example: DBEC_MolsPerCell_MEX.zip + required: false + info: + template: "[sample_name]_DBEC_MolsPerCell_MEX.zip" + - name: "--rsec_mols_per_cell_unfiltered" + type: file + direction: output + description: "Unfiltered tables containing all cell labels with ≥10 reads." + example: RSEC_MolsPerCell_Unfiltered_MEX.zip + required: false + info: + template: "[sample_name]_RSEC_MolsPerCell_Unfiltered_MEX.zip" + - name: "--bam" + type: file + direction: output + description: "Alignment file of R2 with associated R1 annotations for Bioproduct." + example: BioProduct.bam + required: false + info: + template: "[sample_name]_Bioproduct.bam" + - name: "--bam_inddex" + type: file + direction: output + description: "Index file for the alignment file." + example: BioProduct.bam.bai + required: false + info: + template: "[sample_name]_Bioproduct.bam.bai" + - name: "--bioproduct_stats" + type: file + direction: output + description: "Bioproduct Stats. Metrics from RSEC and DBEC Unique Molecular Identifier adjustment algorithms on a per-bioproduct basis." + example: Bioproduct_Stats.csv + required: false + info: + template: "[sample_name]_Bioproduct_Stats.csv" + - name: "--dimred_tsne" + type: file + direction: output + description: "t-SNE dimensionality reduction coordinates per cell index" + example: tSNE_coordinates.csv + required: false + info: + template: "[sample_name]_(assay)_tSNE_coordinates.csv" + - name: "--dimred_umap" + type: file + direction: output + description: "UMAP dimensionality reduction coordinates per cell index" + example: UMAP_coordinates.csv + required: false + info: + template: "[sample_name]_(assay)_UMAP_coordinates.csv" + - name: "--immune_cell_classification" + type: file + direction: output + description: "Immune Cell Classification. Cell type classification based on the expression of immune cell markers." + example: Immune_Cell_Classification.csv + required: false + info: + template: "[sample_name]_(assay)_cell_type_experimental.csv" + - name: Multiplex outputs + description: Outputs when multiplex option is selected + arguments: + - name: "--sample_tag_metrics" + type: file + direction: output + description: "Sample Tag Metrics. Metrics from the sample determination algorithm." + example: Sample_Tag_Metrics.csv + required: false + info: + template: "[sample_name]_Sample_Tag_Metrics.csv" + - name: "--sample_tag_calls" + type: file + direction: output + description: "Sample Tag Calls. Assigned Sample Tag for each putative cell" + example: Sample_Tag_Calls.csv + required: false + info: + template: "[sample_name]_Sample_Tag_Calls.csv" + - name: "--sample_tag_counts" + type: file + direction: output + description: "Sample Tag Counts. Separate data tables and metric summary for cells assigned to each sample tag. Note: For putative cells that could not be assigned a specific Sample Tag, a Multiplet_and_Undetermined.zip file is also output." + example: Sample_Tag1.zip + required: false + multiple: true + info: + template: "[sample_name]_Sample_Tag[number].zip" + - name: "--sample_tag_counts_unassigned" + type: file + direction: output + description: "Sample Tag Counts Unassigned. Data table and metric summary for cells that could not be assigned a specific Sample Tag." + example: Multiplet_and_Undetermined.zip + required: false + info: + template: "[sample_name]_Multiplet_and_Undetermined.zip" + - name: VDJ Outputs + description: Outputs when VDJ option selected + arguments: + - name: "--vdj_metrics" + type: file + direction: output + description: "VDJ Metrics. Overall metrics from the VDJ analysis." + example: VDJ_Metrics.csv + required: false + info: + template: "[sample_name]_VDJ_Metrics.csv" + - name: "--vdj_per_cell" + type: file + direction: output + description: "VDJ Per Cell. Cell specific read and molecule counts, VDJ gene segments, CDR3 sequences, paired chains, and cell type." + example: VDJ_perCell.csv + required: false + info: + template: "[sample_name]_VDJ_perCell.csv" + - name: "--vdj_per_cell_uncorrected" + type: file + direction: output + description: "VDJ Per Cell Uncorrected. Cell specific read and molecule counts, VDJ gene segments, CDR3 sequences, paired chains, and cell type." + example: VDJ_perCell_uncorrected.csv + required: false + info: + template: "[sample_name]_VDJ_perCell_uncorrected.csv" + - name: "--vdj_dominant_contigs" + type: file + direction: output + description: "VDJ Dominant Contigs. Dominant contig for each cell label chain type combination (putative cells only)." + example: VDJ_Dominant_Contigs_AIRR.csv + required: false + info: + template: "[sample_name]_VDJ_Dominant_Contigs_AIRR.csv" + - name: "--vdj_unfiltered_contigs" + type: file + direction: output + description: "VDJ Unfiltered Contigs. All contigs that were assembled and annotated successfully (all cells)." + example: VDJ_Unfiltered_Contigs_AIRR.csv + required: false + info: + template: "[sample_name]_VDJ_Unfiltered_Contigs_AIRR.csv" + - name: "ATAC-Seq outputs" + description: Outputs when ATAC-Seq option selected + arguments: + - name: "--atac_metrics" + type: file + direction: output + description: "ATAC Metrics. Overall metrics from the ATAC-Seq analysis." + example: ATAC_Metrics.csv + required: false + info: + template: "[sample_name]_ATAC_Metrics.csv" + - name: "--atac_metrics_json" + type: file + direction: output + description: "ATAC Metrics JSON. Overall metrics from the ATAC-Seq analysis in JSON format." + example: ATAC_Metrics.json + required: false + info: + template: "[sample_name]_ATAC_Metrics.json" + - name: "--atac_fragments" + type: file + direction: output + description: "ATAC Fragments. Chromosomal location, cell index, and read support for each fragment detected" + example: ATAC_Fragments.bed.gz + required: false + info: + template: "[sample_name]_ATAC_Fragments.bed.gz" + - name: "--atac_fragments_index" + type: file + direction: output + description: "Index of ATAC Fragments." + example: ATAC_Fragments.bed.gz.tbi + required: false + info: + template: "[sample_name]_ATAC_Fragments.bed.gz.tbi" + - name: "--atac_transposase_sites" + type: file + direction: output + description: "ATAC Transposase Sites. Chromosomal location, cell index, and read support for each transposase site detected" + example: ATAC_Transposase_Sites.bed.gz + required: false + info: + template: "[sample_name]_ATAC_Transposase_Sites.bed.gz" + - name: "--atac_transposase_sites_index" + type: file + direction: output + description: "Index of ATAC Transposase Sites." + example: ATAC_Transposase_Sites.bed.gz.tbi + required: false + info: + template: "[sample_name]_ATAC_Transposase_Sites.bed.gz.tbi" + - name: "--atac_peaks" + type: file + direction: output + description: "ATAC Peaks. Peak regions of transposase activity" + example: ATAC_Peaks.bed.gz + required: false + info: + template: "[sample_name]_ATAC_Peaks.bed.gz" + - name: "--atac_peaks_index" + type: file + direction: output + description: "Index of ATAC Peaks." + example: ATAC_Peaks.bed.gz.tbi + required: false + info: + template: "[sample_name]_ATAC_Peaks.bed.gz.tbi" + - name: "--atac_peak_annotation" + type: file + direction: output + description: "ATAC Peak Annotation. Estimated annotation of peak-to-gene connections" + example: peak_annotation.tsv.gz + required: false + info: + template: "[sample_name]_peak_annotation.tsv.gz" + - name: "--atac_cell_by_peak" + type: file + direction: output + description: "ATAC Cell by Peak. Peak regions of transposase activity per cell" + example: ATAC_Cell_by_Peak_MEX.zip + required: false + info: + template: "[sample_name]_ATAC_Cell_by_Peak_MEX.zip" + - name: "--atac_cell_by_peak_unfiltered" + type: file + direction: output + description: "ATAC Cell by Peak Unfiltered. Unfiltered file containing all cell labels with >=1 transposase sites in peaks." + example: ATAC_Cell_by_Peak_Unfiltered_MEX.zip + required: false + info: + template: "[sample_name]_ATAC_Cell_by_Peak_Unfiltered_MEX.zip" + - name: "--atac_bam" + type: file + direction: output + description: "ATAC BAM. Alignment file for R1 and R2 with associated I2 annotations for ATAC-Seq. Only output if the BAM generation flag is set to true." + example: ATAC.bam + required: false + info: + template: "[sample_name]_ATAC.bam" + - name: "--atac_bam_index" + type: file + direction: output + description: "Index of ATAC BAM." + example: ATAC.bam.bai + required: false + info: + template: "[sample_name]_ATAC.bam.bai" + - name: AbSeq Cell Calling outputs + description: Outputs when Cell Calling Abseq is selected + arguments: + - name: "--protein_aggregates_experimental" + type: file + direction: output + description: "Protein Aggregates Experimental" + example: Protein_Aggregates_Experimental.csv + required: false + info: + template: "[sample_name]_Protein_Aggregates_Experimental.csv" - name: Putative Cell Calling Settings arguments: - name: "--cell_calling_data" @@ -298,9 +595,37 @@ argument_groups: - name: "--timestamps" type: boolean_true description: "Add timestamps to the errors, warnings, and notifications." - - name: "--dryrun" - type: boolean_true - description: "If true, the output directory will only contain the CWL input files, but the pipeline itself will not be executed." + - name: Undocumented arguments + arguments: + - name: --abseq_umi + type: integer + multiple: false + info: + config_key: AbSeq_UMI + - name: --target_analysis + type: boolean + multiple: false + info: + config_key: Target_analysis + - name: --vdj_jgene_evalue + type: double + description: | + e-value threshold for J gene. The e-value threshold for J gene call by IgBlast/PyIR, default is set as 0.001 + multiple: false + info: + config_key: VDJ_JGene_Evalue + - name: --vdj_vgene_evalue + type: double + description: | + e-value threshold for V gene. The e-value threshold for V gene call by IgBlast/PyIR, default is set as 0.001 + multiple: false + info: + config_key: VDJ_VGene_Evalue + - name: --write_filtered_reads + type: boolean + multiple: false + info: + config_key: Write_Filtered_Reads resources: - type: python_script path: script.py diff --git a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/script.py b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/script.py index 71eb4bc5..416cabb0 100644 --- a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/script.py +++ b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/script.py @@ -4,6 +4,8 @@ import tempfile from typing import Any import yaml +import shutil +import glob ## VIASH START par = { @@ -60,19 +62,20 @@ def read_config(path: str) -> dict[str, Any]: return config -def logger(msg: str): - print(msg, flush=True) - def strip_margin(text: str) -> str: return re.sub('(\n?)[ \t]*\|', '\\1', text) -def process_params(par: dict[str, Any], config) -> str: +def process_params(par: dict[str, Any], config, temp_dir: str) -> str: # check input parameters assert par["reads"] or par["reads_atac"], "Pass at least one set of inputs to --reads or --reads_atac." + # output to temp dir if output_dir was not passed + if not par["output_dir"]: + par["output_dir"] = os.path.join(temp_dir, "output") + # checking sample prefix if par["run_name"] and re.match("[^A-Za-z0-9]", par["run_name"]): - logger("--run_name should only consist of letters, numbers or hyphens. Replacing all '[^A-Za-z0-9]' with '-'.") + print("--run_name should only consist of letters, numbers or hyphens. Replacing all '[^A-Za-z0-9]' with '-'.", flush=True) par["run_name"] = re.sub("[^A-Za-z0-9\\-]", "-", par["run_name"]) # make paths absolute @@ -121,13 +124,20 @@ def generate_config(par: dict[str, Any], config) -> str: ## Write config to file return ''.join(content_list) -def generate_cwl_file(par: dict[str, Any], meta: dict[str, Any]) -> str: +def generate_config_file(par: dict[str, Any], config: dict[str, Any], temp_dir: str) -> str: + config_file = os.path.join(temp_dir, "config.yml") + config_content = generate_config(par, config) + with open(config_file, "w") as f: + f.write(config_content) + return config_file + +def generate_cwl_file(meta: dict[str, Any], dir: str) -> str: # create cwl file (if need be) orig_cwl_file=os.path.join(meta["resources_dir"], "rhapsody_pipeline_2.2.1_nodocker.cwl") # Inject computational requirements into pipeline if meta["memory_mb"] or meta["cpus"]: - cwl_file = os.path.join(par["output"], "pipeline.cwl") + cwl_file = os.path.join(dir, "pipeline.cwl") # Read in the file with open(orig_cwl_file, 'r') as file : @@ -148,23 +158,43 @@ def generate_cwl_file(par: dict[str, Any], meta: dict[str, Any]) -> str: return cwl_file -def main(par: dict[str, Any], meta: dict[str, Any]): +def copy_outputs(par: dict[str, Any], config: dict[str, Any]): + for arg in config["arguments"]: + par_value = par[arg["clean_name"]] + if par_value and arg["type"] == "file" and par["direction"] == "output": + # example template: '[sample_name]_(assay)_cell_type_experimental.csv' + template = arg.get("info", {}).get("template") + if template: + template_glob = template\ + .replace("[sample_name]", par["run_name"])\ + .replace("(assay)", "*")\ + .replace("[number]", "*") + files = glob.glob(os.path.join(par["output_dir"], template_glob)) + if len(files) == 0 and arg["required"]: + raise ValueError(f"Expected output file '{template_glob}' not found.") + elif len(files) > 1 and not arg["multiple"]: + raise ValueError(f"Expected single output file '{template_glob}', but found multiple.") + + if not arg["multiple"]: + shutil.copy(files[0], par_value) + else: + # replace '*' in par_value with index + for i, file in enumerate(files): + shutil.copy(file, par_value.replace("*", str(i))) + + +def main(par: dict[str, Any], meta: dict[str, Any], temp_dir: str): config = read_config(meta["config"]) - + # Preprocess params - par = process_params(par, config) - - # Create output dir if not exists - if not os.path.exists(par["output"]): - os.makedirs(par["output"]) + par = process_params(par, config, temp_dir) ## Process parameters cmd = [ "cwl-runner", "--no-container", "--preserve-entire-environment", - "--outdir", - par["output"] + "--outdir", par["output_dir"], ] if par["parallel"]: @@ -174,56 +204,32 @@ def main(par: dict[str, Any], meta: dict[str, Any]): cmd.append("--timestamps") # Create cwl file (if need be) - cwl_file = generate_cwl_file(par, meta) - - ## Run pipeline - if not par["dryrun"]: - with tempfile.TemporaryDirectory(prefix="cwl-bd_rhapsody_wta-", dir=meta["temp_dir"]) as temp_dir: - # Create params file - config_file = os.path.join(temp_dir, "config.yml") - config_content = generate_config(par, config) - with open(config_file, "w") as f: - f.write(config_content) - - # add cwl and params file to the cmd - cmd.extend([cwl_file, config_file]) - - # keep environment variables but set TMPDIR to temp_dir - env = dict(os.environ) - env["TMPDIR"] = temp_dir - - logger("> " + ' '.join(cmd)) - _ = subprocess.check_call( - cmd, - cwd=os.path.dirname(config_file), - env=env - ) - - # maybe this won't be necessary anymore - # # extracting feature ids from references - # # extract info from reference files (while they still exist) - # feature_df = extract_feature_types(par) - # feature_types_file = os.path.join(par["output"], "feature_types.tsv") - # feature_df.to_csv(feature_types_file, sep="\t", index=False) - - # if not par["dryrun"]: - # # look for counts file - # if not par["run_name"]: - # par["run_name"] = "sample" - # counts_filename = par["run_name"] + "_RSEC_MolsPerCell.csv" - - # if par["sample_tags_version"]: - # counts_filename = "Combined_" + counts_filename - # counts_file = os.path.join(par["output"], counts_filename) - - # if not os.path.exists(counts_file): - # raise ValueError(f"Could not find output counts file '{counts_filename}'") - - # # look for metrics file - # metrics_filename = par["run_name"] + "_Metrics_Summary.csv" - # metrics_file = os.path.join(par["output"], metrics_filename) - # if not os.path.exists(metrics_file): - # raise ValueError(f"Could not find output metrics file '{metrics_filename}'") + cwl_file = generate_cwl_file(meta, temp_dir) + cmd.append(cwl_file) + + # Create params file + config_file = generate_config_file(par, config, temp_dir) + cmd.append(config_file) + + # keep environment variables but set TMPDIR to temp_dir + env = dict(os.environ) + env["TMPDIR"] = temp_dir + + # Create output dir if not exists + if not os.path.exists(par["output_dir"]): + os.makedirs(par["output_dir"]) + + # Run command + print("> " + ' '.join(cmd), flush=True) + _ = subprocess.check_call( + cmd, + cwd=os.path.dirname(config_file), + env=env + ) + + # Copy outputs + copy_outputs(par, config) if __name__ == "__main__": - main(par, meta) + with tempfile.TemporaryDirectory(prefix="cwl-bd_rhapsody-", dir=meta["temp_dir"]) as temp_dir: + main(par, meta, temp_dir) diff --git a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/test.sh b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/test.sh index 4cfc95a6..c23fda86 100644 --- a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/test.sh +++ b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/test.sh @@ -166,26 +166,36 @@ EOF ######################################################################################### + + # --reads ABCreads_R1.fq.gz \ + # --reads ABCreads_R2.fq.gz \ + # --abseq_reference bdabseq_smallpanel.fasta \ + + # --reads SMKreads_R1.fq.gz \ + # --reads SMKreads_R2.fq.gz \ + # --tag_names 1-Jurkat \ + # --tag_names 2-Ramos \ + # --tag_names 3-THP1 \ + + + # --cell_calling_data mRNA \ + # --target_analysis true \ + # --write_filtered_reads true + # --expected_cell_count 2 \ + echo ">> Run $meta_name" "$meta_executable" \ --reads WTAreads_R1.fq.gz \ --reads WTAreads_R2.fq.gz \ - --reads ABCreads_R1.fq.gz \ - --reads ABCreads_R2.fq.gz \ - --reads SMKreads_R1.fq.gz \ - --reads SMKreads_R2.fq.gz \ --reference_archive "$REFERENCE_FILE" \ - --abseq_reference bdabseq_smallpanel.fasta \ - --output output \ + --output_dir output \ ${meta_cpus:+---cpus $meta_cpus} \ ${meta_memory_mb:+---memory ${meta_memory_mb}MB} \ - --cell_calling_data mRNA \ + --reads ABCreads_R1.fq.gz \ + --reads ABCreads_R2.fq.gz \ + --abseq_reference bdabseq_smallpanel.fasta \ --exact_cell_count 2 \ - --expected_cell_count 2 \ - --exclude_intronic_reads false \ - --tag_names 1-Jurkat \ - --tag_names 2-Ramos \ - --tag_names 3-THP1 + --exclude_intronic_reads false # echo ">> Check if output exists" # assert_file_exists "output.bam"