diff --git a/CHANGELOG.md b/CHANGELOG.md index b31f43d9..07a83c15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ * `agat`: - `agat/agat_convert_genscan2gff`: convert a genscan file into a GFF file (PR #100). +* `bd_rhapsody/bd_rhapsody_sequence_analysis`: BD Rhapsody Sequence Analysis CWL pipeline (PR #96). + ## MINOR CHANGES * Upgrade to Viash 0.9.0. diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml b/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml index e596bf06..dc71262b 100644 --- a/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml @@ -116,12 +116,11 @@ argument_groups: resources: - type: python_script path: script.py - - path: make_rhap_reference_2.2.1_nodocker.cwl test_resources: - type: bash_script path: test.sh - - path: test_data + - path: ../test_data requirements: commands: [ "cwl-runner" ] @@ -131,12 +130,19 @@ engines: image: bdgenomics/rhapsody:2.2.1 setup: - type: apt - packages: [procps] + packages: [procps, git] - type: python packages: [cwlref-runner, cwl-runner] - type: docker run: | - echo "bdgenomics/rhapsody: 2.2.1" > /var/software_versions.txt + mkdir /var/bd_rhapsody_cwl && \ + cd /var/bd_rhapsody_cwl && \ + git clone https://bitbucket.org/CRSwDev/cwl.git . 
&& \ + git checkout 8feeace1141b24749ea6003f8e6ad6d3ad5232de + - type: docker + run: | + VERSION=$(ls -v /var/bd_rhapsody_cwl | grep '^v' | sed 's#v##' | tail -1) && \ + echo "bdgenomics/rhapsody: \"$VERSION\"" > /var/software_versions.txt runners: - type: executable diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl b/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl deleted file mode 100644 index fead2c02..00000000 --- a/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl +++ /dev/null @@ -1,115 +0,0 @@ -requirements: - InlineJavascriptRequirement: {} -class: CommandLineTool -label: Reference Files Generator for BD Rhapsodyâ„¢ Sequencing Analysis Pipeline -cwlVersion: v1.2 -doc: >- - The Reference Files Generator creates an archive containing Genome Index and Transcriptome annotation files needed for the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. The app takes as input one or more FASTA and GTF files and produces a compressed archive in the form of a tar.gz file. The archive contains:\n - STAR index\n - Filtered GTF file - - -baseCommand: run_reference_generator.sh -inputs: - Genome_fasta: - type: File[] - label: Reference Genome - doc: |- - Reference genome file in FASTA format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. - inputBinding: - prefix: --reference-genome - shellQuote: false - Gtf: - type: File[] - label: Transcript Annotations - doc: |- - Transcript annotation files in GTF format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses Gencode v42 for Human and M31 for Mouse. - inputBinding: - prefix: --gtf - shellQuote: false - Extra_sequences: - type: File[]? - label: Extra Sequences - doc: |- - Additional sequences in FASTA format to use when building the STAR index. (E.g. phiX genome) - inputBinding: - prefix: --extra-sequences - shellQuote: false - Mitochondrial_Contigs: - type: string[]? 
- default: ["chrM", "chrMT", "M", "MT"] - label: Mitochondrial Contig Names - doc: |- - Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are identified as 'nuclear fragments' in the ATACseq analysis pipeline. - inputBinding: - prefix: --mitochondrial-contigs - shellQuote: false - Filtering_off: - type: boolean? - label: Turn off filtering - doc: |- - By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features having the following attribute values are are kept: - - protein_coding - - lncRNA (lincRNA and antisense for Gencode < v31/M22/Ensembl97) - - IG_LV_gene - - IG_V_gene - - IG_V_pseudogene - - IG_D_gene - - IG_J_gene - - IG_J_pseudogene - - IG_C_gene - - IG_C_pseudogene - - TR_V_gene - - TR_V_pseudogene - - TR_D_gene - - TR_J_gene - - TR_J_pseudogene - - TR_C_gene - If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True. - inputBinding: - prefix: --filtering-off - shellQuote: false - WTA_Only: - type: boolean? - label: WTA only index - doc: Build a WTA only index, otherwise builds a WTA + ATAC index. - inputBinding: - prefix: --wta-only-index - shellQuote: false - Archive_prefix: - type: string? - label: Archive Prefix - doc: |- - A prefix for naming the compressed archive file containing the Reference genome index and annotation files. The default value is constructed based on the input Reference files. - inputBinding: - prefix: --archive-prefix - shellQuote: false - Extra_STAR_params: - type: string? - label: Extra STAR Params - doc: |- - Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line. - Example: - --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11 - inputBinding: - prefix: --extra-star-params - shellQuote: true - - Maximum_threads: - type: int? 
- label: Maximum Number of Threads - doc: |- - The maximum number of threads to use in the pipeline. By default, all available cores are used. - inputBinding: - prefix: --maximum-threads - shellQuote: false - -outputs: - - Archive: - type: File - doc: |- - A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an input in the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. - id: Reference_Archive - label: Reference Files Archive - outputBinding: - glob: '*.tar.gz' - diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/script.py b/src/bd_rhapsody/bd_rhapsody_make_reference/script.py index ca635508..dcbfe933 100644 --- a/src/bd_rhapsody/bd_rhapsody_make_reference/script.py +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/script.py @@ -83,21 +83,21 @@ def generate_config(par: dict[str, Any], meta, config) -> str: for config_key, arg_type, par_value in config_key_value_pairs: if arg_type == "file": - str = strip_margin(f"""\ + content = strip_margin(f"""\ |{config_key}: |""") if isinstance(par_value, list): for file in par_value: - str += strip_margin(f"""\ + content += strip_margin(f"""\ | - class: File | location: "{file}" |""") else: - str += strip_margin(f"""\ + content += strip_margin(f"""\ | class: File | location: "{par_value}" |""") - content_list.append(str) + content_list.append(content) else: content_list.append(strip_margin(f"""\ |{config_key}: {par_value} @@ -108,9 +108,9 @@ def generate_config(par: dict[str, Any], meta, config) -> str: def get_cwl_file(meta: dict[str, Any]) -> str: # create cwl file (if need be) - cwl_file=os.path.join(meta["resources_dir"], "make_rhap_reference_2.2.1_nodocker.cwl") + cwl_file="/var/bd_rhapsody_cwl/v2.2.1/Extra_Utilities/make_rhap_reference_2.2.1.cwl" - return cwl_file + return os.path.abspath(cwl_file) def main(par: dict[str, Any], meta: dict[str, Any]): config = read_config(meta["config"]) diff --git 
a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh deleted file mode 100644 index 8d468064..00000000 --- a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -TMP_DIR=/tmp/bd_rhapsody_make_reference -OUT_DIR=src/bd_rhapsody/bd_rhapsody_make_reference/test_data - -# check if seqkit is installed -if ! command -v seqkit &> /dev/null; then - echo "seqkit could not be found" - exit 1 -fi - -# create temporary directory and clean up on exit -mkdir -p $TMP_DIR -function clean_up { - rm -rf "$TMP_DIR" -} -trap clean_up EXIT - -# fetch reference -ORIG_FA=$TMP_DIR/reference.fa.gz -if [ ! -f $ORIG_FA ]; then - wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz \ - -O $ORIG_FA -fi - -ORIG_GTF=$TMP_DIR/reference.gtf.gz -if [ ! -f $ORIG_GTF ]; then - wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz \ - -O $ORIG_GTF -fi - -# create small reference -START=30000 -END=31500 -CHR=chr1 - -# subset to small region -seqkit grep -r -p "^$CHR\$" "$ORIG_FA" | \ - seqkit subseq -r "$START:$END" > $OUT_DIR/reference_small.fa - -zcat "$ORIG_GTF" | \ - awk -v FS='\t' -v OFS='\t' " - \$1 == \"$CHR\" && \$4 >= $START && \$5 <= $END { - \$4 = \$4 - $START + 1; - \$5 = \$5 - $START + 1; - print; - }" > $OUT_DIR/reference_small.gtf diff --git a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/_process_cwl.R b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/_process_cwl.R new file mode 100644 index 00000000..e33b8ea7 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/_process_cwl.R @@ -0,0 +1,116 @@ +# Extract arguments from CWL file and write them to arguments.yaml +# +# This script: +# - reads the CWL file +# - extracts the main workflow arguments +# - compares cwl arguments to viash config arguments +# - writes the arguments to 
arguments.yaml +# +# It can be used to update the arguments in the viash config after an +# update to the CWL file has been made. +# +# Dependencies: tidyverse, jsonlite, yaml, dynutils +# +# Install dependencies: +# ```R +# install.packages(c("tidyverse", "jsonlite", "yaml", "dynutils")) +# ``` +# +# Usage: +# ```bash +# Rscript src/bd_rhapsody/bd_rhapsody_sequence_analysis/_process_cwl.R +# ``` + +library(tidyverse) + +# fetch and read cwl file +lines <- read_lines("https://bitbucket.org/CRSwDev/cwl/raw/8feeace1141b24749ea6003f8e6ad6d3ad5232de/v2.2.1/rhapsody_pipeline_2.2.1.cwl") +cwl_header <- lines[[1]] +cwl_obj <- jsonlite::fromJSON(lines[-1], simplifyVector = FALSE) + +# detect main workflow arguments +gr <- dynutils::list_as_tibble(cwl_obj$`$graph`) + +gr %>% print(n = 100) + +main <- gr %>% filter(gr$id == "#main") + +main_inputs <- main$inputs[[1]] + +input_ids <- main_inputs %>% map_chr("id") %>% gsub("^#main/", "", .) + +# check whether in config +config <- yaml::read_yaml("src/bd_rhapsody/bd_rhapsody_sequence_analysis/config.vsh.yaml") +config$all_arguments <- config$argument_groups %>% map("arguments") %>% list_flatten() +arg_names <- config$all_arguments %>% map_chr("name") %>% gsub("^--", "", .) + +# arguments in cwl but not in config +setdiff(tolower(input_ids), arg_names) + +# arguments in config but not in cwl +setdiff(arg_names, tolower(input_ids)) + +# create arguments from main_inputs +arguments <- map(main_inputs, function(main_input) { + input_id <- main_input$id %>% gsub("^#main/", "", .) + input_type <- main_input$type[[2]] + + if (is.list(input_type) && input_type$type == "array") { + multiple <- TRUE + input_type <- input_type$items + } else { + multiple <- FALSE + } + + if (is.list(input_type) && input_type$type == "enum") { + choices <- input_type$symbols %>% + gsub(paste0(input_type$name, "/"), "", .) 
+ input_type <- "enum" + } else { + choices <- NULL + } + + description <- + if (is.null(main_input$label)) { + main_input$doc + } else if (is.null(main_input$doc)) { + main_input$label + } else { + paste0(main_input$label, ". ", main_input$doc) + } + + type_map <- c( + "float" = "double", + "int" = "integer", + "string" = "string", + "boolean" = "boolean", + "File" = "file", + "enum" = "string" + ) + + out <- list( + name = paste0("--", tolower(input_id)), + type = unname(type_map[input_type]), # unname: a named vector would be written as a YAML map, not a scalar + # TODO: use summary when viash 0.9 is released + # summary = main_input$doc, + # description = main_input$doc, + description = description, + multiple = multiple, + choices = choices, + info = list( + config_key = input_id + ) + ) + + out[!sapply(out, is.null)] +}) + + + +yaml::write_yaml( + arguments, + "src/bd_rhapsody/bd_rhapsody_sequence_analysis/arguments.yaml", + handlers = list( + logical = yaml::verbatim_logical + ) +) diff --git a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/config.vsh.yaml b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/config.vsh.yaml new file mode 100644 index 00000000..eb3eaf38 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/config.vsh.yaml @@ -0,0 +1,661 @@ +name: bd_rhapsody_sequence_analysis +namespace: bd_rhapsody +description: | + BD Rhapsody Sequence Analysis CWL pipeline v2.2. + + This pipeline performs analysis of single-cell multiomic sequence read (FASTQ) data. The supported + sequencing libraries are those generated by the BD Rhapsody™ assay kits, including: Whole Transcriptome + mRNA (WTA), Targeted mRNA, AbSeq Antibody-Oligonucleotides (ABC), Single-Cell Multiplexing (SMK), + TCR/BCR (VDJ), and ATAC-Seq. 
+keywords: [rna-seq, single-cell, multiomic, atac-seq, targeted, abseq, tcr, bcr] +links: + repository: https://bitbucket.org/CRSwDev/cwl/src/master/v2.2.1 + documentation: https://bd-rhapsody-bioinfo-docs.genomics.bd.com +license: Unknown +authors: + - __merge__: /src/_authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/_authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - name: "--reads" + type: file + description: | + Reads (optional) - Path to your FASTQ.GZ formatted read files from libraries that may include: + + - WTA mRNA + - Targeted mRNA + - AbSeq + - Sample Multiplexing + - VDJ + + You may specify as many R1/R2 read pairs as you want. + required: false + multiple: true + example: + - WTALibrary_S1_L001_R1_001.fastq.gz + - WTALibrary_S1_L001_R2_001.fastq.gz + info: + config_key: Reads + - name: "--reads_atac" + type: file + description: | + Path to your FASTQ.GZ formatted read files from ATAC-Seq libraries. + You may specify as many R1/R2/I2 files as you want. + required: false + multiple: true + example: + - ATACLibrary_S2_L001_R1_001.fastq.gz + - ATACLibrary_S2_L001_R2_001.fastq.gz + - ATACLibrary_S2_L001_I2_001.fastq.gz + info: + config_key: Reads_ATAC + - name: References + description: | + Assay type will be inferred from the provided reference(s). + Do not provide both reference_archive and targeted_reference at the same time. 
+ + Valid reference input combinations: + - reference_archive: WTA only + - reference_archive & abseq_reference: WTA + AbSeq + - reference_archive & supplemental_reference: WTA + extra transgenes + - reference_archive & abseq_reference & supplemental_reference: WTA + AbSeq + extra transgenes + - reference_archive: WTA + ATAC or ATAC only + - reference_archive & supplemental_reference: WTA + ATAC + extra transgenes + - targeted_reference: Targeted only + - targeted_reference & abseq_reference: Targeted + AbSeq + - abseq_reference: AbSeq only + + The reference_archive can be generated with the bd_rhapsody_make_reference component. + Alternatively, BD also provides standard references which can be downloaded from these locations: + + - Human: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Human_WTA_2023-02.tar.gz + - Mouse: https://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-WTA/Pipeline-version2.x_WTA_references/RhapRef_Mouse_WTA_2023-02.tar.gz + arguments: + - name: "--reference_archive" + type: file + description: | + Path to Rhapsody WTA Reference in the tar.gz format. + + Structure of the reference archive: + + - `BD_Rhapsody_Reference_Files/`: top level folder + - `star_index/`: sub-folder containing STAR index, that is files created with `STAR --runMode genomeGenerate` + - GTF for gene-transcript-annotation e.g. "gencode.v43.primary_assembly.annotation.gtf" + example: "RhapRef_Human_WTA_2023-02.tar.gz" + required: false + info: + config_key: Reference_Archive + - name: "--targeted_reference" + type: file + description: | + Path to the targeted reference file in FASTA format. + example: "BD_Rhapsody_Immune_Response_Panel_Hs.fasta" + multiple: true + info: + config_key: Targeted_Reference + - name: "--abseq_reference" + type: file + description: Path to the AbSeq reference file in FASTA format. Only needed if BD AbSeq Ab-Oligos are used. 
+ example: "AbSeq_reference.fasta" + multiple: true + info: + config_key: AbSeq_Reference + - name: "--supplemental_reference" + type: file + alternatives: [-s] + description: Path to the supplemental reference file in FASTA format. Only needed if there are additional transgene sequences to be aligned against in a WTA assay experiment. + example: "supplemental_reference.fasta" + multiple: true + info: + config_key: Supplemental_Reference + - name: Outputs + description: Outputs for all pipeline runs + # based on https://bd-rhapsody-bioinfo-docs.genomics.bd.com/outputs/top_outputs.html + arguments: + - name: "--output_dir" + type: file + direction: output + alternatives: [-o] + description: "The unprocessed output directory containing all the outputs from the pipeline." + required: true + example: output_dir/ + - name: "--output_seurat" + type: file + direction: output + description: "Single-cell analysis tool inputs. Seurat (.rds) input file containing RSEC molecules data table and all cell annotation metadata." + example: output_seurat.rds + required: false + info: + template: "[sample_name]_Seurat.rds" + - name: "--output_mudata" + type: file + direction: output + description: "Single-cell analysis tool inputs. Scanpy / Muon input file containing RSEC molecules data table and all cell annotation metadata." + example: output_mudata.h5mu + required: false + info: + template: "[sample_name].h5mu" + - name: "--metrics_summary" + type: file + direction: output + description: "Metrics Summary. Report containing sequencing, molecules, and cell metrics." + example: metrics_summary.csv + required: false + info: + template: "[sample_name]_Metrics_Summary.csv" + - name: "--pipeline_report" + type: file + direction: output + description: "Pipeline Report. Summary report containing the results from the sequencing analysis pipeline run." 
+ example: pipeline_report.html + required: false + info: + template: "[sample_name]_Pipeline_Report.html" + - name: "--rsec_mols_per_cell" + type: file + direction: output + description: "Molecules per bioproduct per cell based on RSEC" + example: RSEC_MolsPerCell_MEX.zip + required: false + info: + template: "[sample_name]_RSEC_MolsPerCell_MEX.zip" + - name: "--dbec_mols_per_cell" + type: file + direction: output + description: "Molecules per bioproduct per cell based on DBEC. DBEC data table is only output if the experiment includes targeted mRNA or AbSeq bioproducts." + example: DBEC_MolsPerCell_MEX.zip + required: false + info: + template: "[sample_name]_DBEC_MolsPerCell_MEX.zip" + - name: "--rsec_mols_per_cell_unfiltered" + type: file + direction: output + description: "Unfiltered tables containing all cell labels with ≥10 reads." + example: RSEC_MolsPerCell_Unfiltered_MEX.zip + required: false + info: + template: "[sample_name]_RSEC_MolsPerCell_Unfiltered_MEX.zip" + - name: "--bam" + type: file + direction: output + description: "Alignment file of R2 with associated R1 annotations for Bioproduct." + example: BioProduct.bam + required: false + info: + template: "[sample_name]_Bioproduct.bam" + - name: "--bam_index" + type: file + direction: output + description: "Index file for the alignment file." + example: BioProduct.bam.bai + required: false + info: + template: "[sample_name]_Bioproduct.bam.bai" + - name: "--bioproduct_stats" + type: file + direction: output + description: "Bioproduct Stats. Metrics from RSEC and DBEC Unique Molecular Identifier adjustment algorithms on a per-bioproduct basis." 
+ example: Bioproduct_Stats.csv + required: false + info: + template: "[sample_name]_Bioproduct_Stats.csv" + - name: "--dimred_tsne" + type: file + direction: output + description: "t-SNE dimensionality reduction coordinates per cell index" + example: tSNE_coordinates.csv + required: false + info: + template: "[sample_name]_(assay)_tSNE_coordinates.csv" + - name: "--dimred_umap" + type: file + direction: output + description: "UMAP dimensionality reduction coordinates per cell index" + example: UMAP_coordinates.csv + required: false + info: + template: "[sample_name]_(assay)_UMAP_coordinates.csv" + - name: "--immune_cell_classification" + type: file + direction: output + description: "Immune Cell Classification. Cell type classification based on the expression of immune cell markers." + example: Immune_Cell_Classification.csv + required: false + info: + template: "[sample_name]_(assay)_cell_type_experimental.csv" + - name: Multiplex outputs + description: Outputs when multiplex option is selected + arguments: + - name: "--sample_tag_metrics" + type: file + direction: output + description: "Sample Tag Metrics. Metrics from the sample determination algorithm." + example: Sample_Tag_Metrics.csv + required: false + info: + template: "[sample_name]_Sample_Tag_Metrics.csv" + - name: "--sample_tag_calls" + type: file + direction: output + description: "Sample Tag Calls. Assigned Sample Tag for each putative cell" + example: Sample_Tag_Calls.csv + required: false + info: + template: "[sample_name]_Sample_Tag_Calls.csv" + - name: "--sample_tag_counts" + type: file + direction: output + description: "Sample Tag Counts. Separate data tables and metric summary for cells assigned to each sample tag. Note: For putative cells that could not be assigned a specific Sample Tag, a Multiplet_and_Undetermined.zip file is also output." 
+ example: Sample_Tag1.zip + required: false + multiple: true + info: + template: "[sample_name]_Sample_Tag[number].zip" + - name: "--sample_tag_counts_unassigned" + type: file + direction: output + description: "Sample Tag Counts Unassigned. Data table and metric summary for cells that could not be assigned a specific Sample Tag." + example: Multiplet_and_Undetermined.zip + required: false + info: + template: "[sample_name]_Multiplet_and_Undetermined.zip" + - name: VDJ Outputs + description: Outputs when VDJ option selected + arguments: + - name: "--vdj_metrics" + type: file + direction: output + description: "VDJ Metrics. Overall metrics from the VDJ analysis." + example: VDJ_Metrics.csv + required: false + info: + template: "[sample_name]_VDJ_Metrics.csv" + - name: "--vdj_per_cell" + type: file + direction: output + description: "VDJ Per Cell. Cell specific read and molecule counts, VDJ gene segments, CDR3 sequences, paired chains, and cell type." + example: VDJ_perCell.csv + required: false + info: + template: "[sample_name]_VDJ_perCell.csv" + - name: "--vdj_per_cell_uncorrected" + type: file + direction: output + description: "VDJ Per Cell Uncorrected. Cell specific read and molecule counts, VDJ gene segments, CDR3 sequences, paired chains, and cell type." + example: VDJ_perCell_uncorrected.csv + required: false + info: + template: "[sample_name]_VDJ_perCell_uncorrected.csv" + - name: "--vdj_dominant_contigs" + type: file + direction: output + description: "VDJ Dominant Contigs. Dominant contig for each cell label chain type combination (putative cells only)." + example: VDJ_Dominant_Contigs_AIRR.csv + required: false + info: + template: "[sample_name]_VDJ_Dominant_Contigs_AIRR.csv" + - name: "--vdj_unfiltered_contigs" + type: file + direction: output + description: "VDJ Unfiltered Contigs. All contigs that were assembled and annotated successfully (all cells)." 
+ example: VDJ_Unfiltered_Contigs_AIRR.csv + required: false + info: + template: "[sample_name]_VDJ_Unfiltered_Contigs_AIRR.csv" + - name: "ATAC-Seq outputs" + description: Outputs when ATAC-Seq option selected + arguments: + - name: "--atac_metrics" + type: file + direction: output + description: "ATAC Metrics. Overall metrics from the ATAC-Seq analysis." + example: ATAC_Metrics.csv + required: false + info: + template: "[sample_name]_ATAC_Metrics.csv" + - name: "--atac_metrics_json" + type: file + direction: output + description: "ATAC Metrics JSON. Overall metrics from the ATAC-Seq analysis in JSON format." + example: ATAC_Metrics.json + required: false + info: + template: "[sample_name]_ATAC_Metrics.json" + - name: "--atac_fragments" + type: file + direction: output + description: "ATAC Fragments. Chromosomal location, cell index, and read support for each fragment detected" + example: ATAC_Fragments.bed.gz + required: false + info: + template: "[sample_name]_ATAC_Fragments.bed.gz" + - name: "--atac_fragments_index" + type: file + direction: output + description: "Index of ATAC Fragments." + example: ATAC_Fragments.bed.gz.tbi + required: false + info: + template: "[sample_name]_ATAC_Fragments.bed.gz.tbi" + - name: "--atac_transposase_sites" + type: file + direction: output + description: "ATAC Transposase Sites. Chromosomal location, cell index, and read support for each transposase site detected" + example: ATAC_Transposase_Sites.bed.gz + required: false + info: + template: "[sample_name]_ATAC_Transposase_Sites.bed.gz" + - name: "--atac_transposase_sites_index" + type: file + direction: output + description: "Index of ATAC Transposase Sites." + example: ATAC_Transposase_Sites.bed.gz.tbi + required: false + info: + template: "[sample_name]_ATAC_Transposase_Sites.bed.gz.tbi" + - name: "--atac_peaks" + type: file + direction: output + description: "ATAC Peaks. 
Peak regions of transposase activity" + example: ATAC_Peaks.bed.gz + required: false + info: + template: "[sample_name]_ATAC_Peaks.bed.gz" + - name: "--atac_peaks_index" + type: file + direction: output + description: "Index of ATAC Peaks." + example: ATAC_Peaks.bed.gz.tbi + required: false + info: + template: "[sample_name]_ATAC_Peaks.bed.gz.tbi" + - name: "--atac_peak_annotation" + type: file + direction: output + description: "ATAC Peak Annotation. Estimated annotation of peak-to-gene connections" + example: peak_annotation.tsv.gz + required: false + info: + template: "[sample_name]_peak_annotation.tsv.gz" + - name: "--atac_cell_by_peak" + type: file + direction: output + description: "ATAC Cell by Peak. Peak regions of transposase activity per cell" + example: ATAC_Cell_by_Peak_MEX.zip + required: false + info: + template: "[sample_name]_ATAC_Cell_by_Peak_MEX.zip" + - name: "--atac_cell_by_peak_unfiltered" + type: file + direction: output + description: "ATAC Cell by Peak Unfiltered. Unfiltered file containing all cell labels with >=1 transposase sites in peaks." + example: ATAC_Cell_by_Peak_Unfiltered_MEX.zip + required: false + info: + template: "[sample_name]_ATAC_Cell_by_Peak_Unfiltered_MEX.zip" + - name: "--atac_bam" + type: file + direction: output + description: "ATAC BAM. Alignment file for R1 and R2 with associated I2 annotations for ATAC-Seq. Only output if the BAM generation flag is set to true." + example: ATAC.bam + required: false + info: + template: "[sample_name]_ATAC.bam" + - name: "--atac_bam_index" + type: file + direction: output + description: "Index of ATAC BAM." 
+ example: ATAC.bam.bai + required: false + info: + template: "[sample_name]_ATAC.bam.bai" + - name: AbSeq Cell Calling outputs + description: Outputs when Cell Calling Abseq is selected + arguments: + - name: "--protein_aggregates_experimental" + type: file + direction: output + description: "Protein Aggregates Experimental" + example: Protein_Aggregates_Experimental.csv + required: false + info: + template: "[sample_name]_Protein_Aggregates_Experimental.csv" + - name: Putative Cell Calling Settings + arguments: + - name: "--cell_calling_data" + type: string + description: | + Specify the dataset to be used for putative cell calling: mRNA, AbSeq, ATAC, mRNA_and_ATAC + + For putative cell calling using an AbSeq dataset, please provide an AbSeq_Reference fasta file above. + + For putative cell calling using an ATAC dataset, please provide a WTA+ATAC-Seq Reference_Archive file above. + + The default data for putative cell calling, will be determined the following way: + + - If mRNA Reads and ATAC Reads exist: mRNA_and_ATAC + - If only ATAC Reads exist: ATAC + - Otherwise: mRNA + choices: [mRNA, AbSeq, ATAC, mRNA_and_ATAC] + example: mRNA + info: + config_key: Cell_Calling_Data + - name: "--cell_calling_bioproduct_algorithm" + type: string + description: | + Specify the bioproduct algorithm to be used for putative cell calling: Basic or Refined + + By default, the Basic algorithm will be used for putative cell calling. + choices: [Basic, Refined] + example: Basic + info: + config_key: Cell_Calling_Bioproduct_Algorithm + - name: "--cell_calling_atac_algorithm" + type: string + description: | + Specify the ATAC-seq algorithm to be used for putative cell calling: Basic or Refined + + By default, the Basic algorithm will be used for putative cell calling. 
+ choices: [Basic, Refined] + example: Basic + info: + config_key: Cell_Calling_ATAC_Algorithm + - name: "--exact_cell_count" + type: integer + description: | + Set a specific number (>=1) of cells as putative, based on those with the highest error-corrected read count + example: 10000 + min: 1 + info: + config_key: Exact_Cell_Count + - name: "--expected_cell_count" + type: integer + description: | + Guide the basic putative cell calling algorithm by providing an estimate of the number of cells expected. Usually this can be the number of cells loaded into the Rhapsody cartridge. If there are multiple inflection points on the second derivative cumulative curve, this will ensure the one selected is near the expected. + example: 20000 + min: 1 + info: + config_key: Expected_Cell_Count + - name: Intronic Reads Settings + arguments: + - name: --exclude_intronic_reads + type: boolean + description: | + By default, the flag is false, and reads aligned to exons and introns are considered and represented in molecule counts. When the flag is set to true, intronic reads will be excluded. + The value can be true or false. + example: false + info: + config_key: Exclude_Intronic_Reads + - name: Multiplex Settings + arguments: + - name: "--sample_tags_version" + type: string + description: | + Specify the version of the Sample Tags used in the run: + + * If Sample Tag Multiplexing was done, specify the appropriate version: human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only + * If this is an SMK + Nuclei mRNA run or an SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq) run (and not an SMK + ATAC-Seq only run), choose the "nuclei_includes_mrna" option. + * If this is an SMK + ATAC-Seq only run (and not SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq)), choose the "nuclei_atac_only" option. 
+ choices: [human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only] + example: human + info: + config_key: Sample_Tags_Version + - name: "--tag_names" + type: string + description: | + Specify the tag number followed by '-' and the desired sample name to appear in Sample_Tag_Metrics.csv + Do not use the special characters: &, (), [], {}, <>, ?, | + multiple: true + example: [4-mySample, 9-myOtherSample, 6-alsoThisSample] + info: + config_key: Tag_Names + - name: VDJ arguments + arguments: + - name: "--vdj_version" + type: string + description: | + If VDJ was done, specify the appropriate option: human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR + choices: [human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR] + example: human + info: + config_key: VDJ_Version + - name: ATAC options + arguments: + - name: "--predefined_atac_peaks" + type: file + description: An optional BED file containing pre-established chromatin accessibility peak regions for generating the ATAC cell-by-peak matrix. + example: predefined_peaks.bed + info: + config_key: Predefined_ATAC_Peaks + - name: Additional options + arguments: + - name: "--run_name" + type: string + description: | + Specify a run name to use as the output file base name. Use only letters, numbers, or hyphens. Do not use special characters or spaces. + default: sample + info: + config_key: Run_Name + - name: "--generate_bam" + type: boolean + description: | + Specify whether to create the BAM file output + default: false + info: + config_key: Generate_Bam + - name: "--long_reads" + type: boolean + description: | + Use STARlong (default: undefined - i.e. autodetects based on read lengths) - Specify if the STARlong aligner should be used instead of STAR. Set to true if the reads are longer than 650bp. 
+ info: + config_key: Long_Reads + - name: Advanced options + description: | + NOTE: Only change these if you are really sure about what you are doing + arguments: + - name: "--custom_star_params" + type: string + description: | + Modify STAR alignment parameters - Set this parameter to fully override default STAR mapping parameters used in the pipeline. + For reference this is the default that is used: + + Short Reads: `--outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin 25 --limitOutSJcollapsed 2000000` + Long Reads: Same as Short Reads + `--seedPerReadNmax 10000` + + This applies to fastqs provided in the Reads user input + Do NOT set any non-mapping related params like `--genomeDir`, `--outSAMtype`, `--outSAMunmapped`, `--readFilesIn`, `--runThreadN`, etc. + We use STAR version 2.7.10b + example: "--alignIntronMax 6000 --outFilterScoreMinOverLread 0.1 --limitOutSJcollapsed 2000000" + info: + config_key: Custom_STAR_Params + - name: "--custom_bwa_mem2_params" + type: string + description: | + Modify bwa-mem2 alignment parameters - Set this parameter to fully override bwa-mem2 mapping parameters used in the pipeline + The pipeline does not specify any custom mapping params to bwa-mem2 so program default values are used + This applies to fastqs provided in the Reads_ATAC user input + Do NOT set any non-mapping related params like `-C`, `-t`, etc. + We use bwa-mem2 version 2.2.1 + example: "-k 16 -w 200 -r" + info: + config_key: Custom_bwa_mem2_Params + - name: CWL-runner arguments + arguments: + - name: "--parallel" + type: boolean + description: "Run jobs in parallel." + default: true + - name: "--timestamps" + type: boolean_true + description: "Add timestamps to the errors, warnings, and notifications." 
+ - name: Undocumented arguments + arguments: + - name: --abseq_umi + type: integer + multiple: false + info: + config_key: AbSeq_UMI + - name: --target_analysis + type: boolean + multiple: false + info: + config_key: Target_analysis + - name: --vdj_jgene_evalue + type: double + description: | + e-value threshold for J gene. The e-value threshold for J gene call by IgBlast/PyIR, default is set as 0.001 + multiple: false + info: + config_key: VDJ_JGene_Evalue + - name: --vdj_vgene_evalue + type: double + description: | + e-value threshold for V gene. The e-value threshold for V gene call by IgBlast/PyIR, default is set as 0.001 + multiple: false + info: + config_key: VDJ_VGene_Evalue + - name: --write_filtered_reads + type: boolean + multiple: false + info: + config_key: Write_Filtered_Reads +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: test.py + - path: ../test_data + - path: ../helpers + +requirements: + commands: [ "cwl-runner" ] + +engines: + - type: docker + image: bdgenomics/rhapsody:2.2.1 + setup: + - type: apt + packages: [procps, git] + - type: python + packages: [cwlref-runner, cwl-runner] + - type: docker + run: | + mkdir /var/bd_rhapsody_cwl && \ + cd /var/bd_rhapsody_cwl && \ + git clone https://bitbucket.org/CRSwDev/cwl.git . 
&& \ + git checkout 8feeace1141b24749ea6003f8e6ad6d3ad5232de + - type: docker + run: + - VERSION=$(ls -v /var/bd_rhapsody_cwl | grep '^v' | sed 's#v##' | tail -1) + - 'echo "bdgenomics/rhapsody: \"$VERSION\"" > /var/software_versions.txt' + test_setup: + - type: python + packages: [biopython, gffutils] +runners: + - type: executable + - type: nextflow diff --git a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/help.txt b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/help.txt new file mode 100644 index 00000000..618faa3e --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/help.txt @@ -0,0 +1,167 @@ +```bash +cwl-runner src/bd_rhapsody/bd_rhapsody_sequence_analysis/rhapsody_pipeline_2.2.1_nodocker.cwl --help +``` + +usage: src/bd_rhapsody/bd_rhapsody_sequence_analysis/rhapsody_pipeline_2.2.1_nodocker.cwl + [-h] [--AbSeq_Reference ABSEQ_REFERENCE] [--AbSeq_UMI ABSEQ_UMI] + [--Cell_Calling_ATAC_Algorithm CELL_CALLING_ATAC_ALGORITHM] + [--Cell_Calling_Bioproduct_Algorithm CELL_CALLING_BIOPRODUCT_ALGORITHM] + [--Cell_Calling_Data CELL_CALLING_DATA] + [--Custom_STAR_Params CUSTOM_STAR_PARAMS] + [--Custom_bwa_mem2_Params CUSTOM_BWA_MEM2_PARAMS] + [--Exact_Cell_Count EXACT_CELL_COUNT] [--Exclude_Intronic_Reads] + [--Expected_Cell_Count EXPECTED_CELL_COUNT] [--Generate_Bam] + [--Long_Reads] [--Maximum_Threads MAXIMUM_THREADS] + [--Predefined_ATAC_Peaks PREDEFINED_ATAC_PEAKS] [--Reads READS] + [--Reads_ATAC READS_ATAC] [--Reference_Archive REFERENCE_ARCHIVE] + [--Run_Name RUN_NAME] [--Sample_Tags_Version SAMPLE_TAGS_VERSION] + [--Supplemental_Reference SUPPLEMENTAL_REFERENCE] + [--Tag_Names TAG_NAMES] [--Target_analysis] + [--Targeted_Reference TARGETED_REFERENCE] + [--VDJ_JGene_Evalue VDJ_JGENE_EVALUE] + [--VDJ_VGene_Evalue VDJ_VGENE_EVALUE] [--VDJ_Version VDJ_VERSION] + [--Write_Filtered_Reads] + [job_order] + +The BD Rhapsody™ assays are used to create sequencing libraries from single +cell transcriptomes. 
After sequencing, the analysis pipeline takes the FASTQ +files and a reference file for gene alignment. The pipeline generates +molecular counts per cell, read counts per cell, metrics, and an alignment +file. + +positional arguments: + job_order Job input json file + +options: + -h, --help show this help message and exit + --AbSeq_Reference ABSEQ_REFERENCE + AbSeq Reference + --AbSeq_UMI ABSEQ_UMI + --Cell_Calling_ATAC_Algorithm CELL_CALLING_ATAC_ALGORITHM + Specify the ATAC algorithm to be used for ATAC + putative cell calling. The Basic algorithm is the + default. + --Cell_Calling_Bioproduct_Algorithm CELL_CALLING_BIOPRODUCT_ALGORITHM + Specify the bioproduct algorithm to be used for + mRNA/AbSeq putative cell calling. The Basic algorithm + is the default. + --Cell_Calling_Data CELL_CALLING_DATA + Specify the data to be used for putative cell calling. + The default data for putative cell calling will be + determined the following way: - If mRNA and ATAC Reads + exist, mRNA_and_ATAC is the default. - If only ATAC + Reads exist, ATAC is the default. - Otherwise, mRNA is + the default. + --Custom_STAR_Params CUSTOM_STAR_PARAMS + Allows you to specify custom STAR aligner mapping + parameters. Only the mapping parameters you provide + here will be used with STAR, meaning that you must + provide the complete list of parameters that you want + to take effect. For reference, the parameters used by + default in the pipeline are: 1. Short Reads: + --outFilterScoreMinOverLread 0 + --outFilterMatchNminOverLread 0 + --outFilterMultimapScoreRange 0 --clip3pAdapterSeq + AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + --seedSearchStartLmax 50 --outFilterMatchNmin 25 + --limitOutSJcollapsed 2000000 2. Long Reads: Same + options as short reads + --seedPerReadNmax 10000 + Example input: --alignIntronMax 500000 + --outFilterScoreMinOverLread 0 --limitOutSJcollapsed + 2000000 Important: 1. This applies to fastqs provided + in the Reads user input 2. 
Please do not specify any + non-mapping related params like: --runThreadN, + --genomeDir --outSAMtype, etc. 3. Please only use + params supported by STAR version 2.7.10b + --Custom_bwa_mem2_Params CUSTOM_BWA_MEM2_PARAMS + Allows you to specify custom bwa-mem2 mapping + parameters. Only the mapping parameters you provide + here will be used with bwa-mem2, meaning that you must + provide the complete list of parameters that you want + to take effect. The pipeline uses program default + mapping parameters. Example input: -k 15 -w 200 -r 2 + Important: 1. This applies to fastqs provided in the + Reads_ATAC user input 2. Please do not specify any + non-mapping related params like: -C, -t, etc. 3. + Please only use params supported by bwa-mem2 version + 2.2.1 + --Exact_Cell_Count EXACT_CELL_COUNT + Set a specific number (>=1) of cells as putative, + based on those with the highest error-corrected read + count + --Exclude_Intronic_Reads + By default, reads aligned to exons and introns are + considered and represented in molecule counts. + Including intronic reads may increase sensitivity, + resulting in an increase in molecule counts and the + number of genes per cell for both cellular and nuclei + samples. Intronic reads may indicate unspliced mRNAs + and are also useful, for example, in the study of + nuclei and RNA velocity. When set to true, intronic + reads will be excluded. + --Expected_Cell_Count EXPECTED_CELL_COUNT + Optional. Guide the basic putative cell calling + algorithm by providing an estimate of the number of + cells expected. Usually this can be the number of + cells loaded into the Rhapsody cartridge. If there are + multiple inflection points on the second derivative + cumulative curve, this will ensure the one selected is + near the expected. + --Generate_Bam Default: false. A Bam read alignment file contains + reads from all the input libraries, but creating it + can consume a lot of compute and disk resources. 
By + setting this field to true, the Bam file will be + created. This option is shared for both Bioproduct and + ATAC libraries. + --Long_Reads By default, we detect if there are any reads longer + than 650bp and then flag QualCLAlign to use STARlong + instead of STAR. This flag can be explicitly set if it + is known in advance that there are reads longer than + 650bp. + --Maximum_Threads MAXIMUM_THREADS + The maximum number of threads to use in the pipeline. + By default, all available cores are used. + --Predefined_ATAC_Peaks PREDEFINED_ATAC_PEAKS + An optional BED file containing pre-established + chromatin accessibility peak regions for generating + the ATAC cell-by-peak matrix. Only applies to ATAC + assays. + --Reads READS FASTQ files from libraries that may include WTA mRNA, + Targeted mRNA, AbSeq, Sample Multiplexing, and related + technologies + --Reads_ATAC READS_ATAC + FASTQ files from libraries generated using the ATAC + assay protocol. Each lane of a library is expected to + have 3 FASTQs - R1, R2 and I1/I2, where the index read + contains the Cell Barcode and UMI sequence. Only + applies to ATAC assays. + --Reference_Archive REFERENCE_ARCHIVE + Reference Files Archive + --Run_Name RUN_NAME This is a name for output files, for example + Experiment1_Metrics_Summary.csv. Default if left empty + is to name run based on a library. Any non-alpha + numeric characters will be changed to a hyphen. + --Sample_Tags_Version SAMPLE_TAGS_VERSION + The sample multiplexing kit version. This option + should only be set for a multiplexed experiment. + --Supplemental_Reference SUPPLEMENTAL_REFERENCE + Supplemental Reference + --Tag_Names TAG_NAMES + Specify the Sample Tag number followed by - (hyphen) + and a sample name to appear in the output files. For + example: 4-Ramos. Should be alpha numeric, with + - + and _ allowed. Any special characters: &, (), [], {}, + <>, ?, | will be corrected to underscores. 
+ --Target_analysis + --Targeted_Reference TARGETED_REFERENCE + Targeted Reference + --VDJ_JGene_Evalue VDJ_JGENE_EVALUE + The e-value threshold for J gene call by IgBlast/PyIR, + default is set as 0.001 + --VDJ_VGene_Evalue VDJ_VGENE_EVALUE + The e-value threshold for V gene call by IgBlast/PyIR, + default is set as 0.001 + --VDJ_Version VDJ_VERSION + The VDJ species and chain types. This option should + only be set for VDJ experiment. + --Write_Filtered_Reads diff --git a/src/bd_rhapsody/bd_rhapsody_sequence_analysis/pipeline_inputs_template_2.2.1.yaml b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/pipeline_inputs_template_2.2.1.yaml new file mode 100644 index 00000000..19728a57 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_sequence_analysis/pipeline_inputs_template_2.2.1.yaml @@ -0,0 +1,203 @@ +#!/usr/bin/env cwl-runner + +cwl:tool: rhapsody + +# This is a template YML file used to specify the inputs for a BD Rhapsody Sequence Analysis pipeline run. +# See the BD Rhapsody Sequence Analysis Pipeline User Guide for more details. Enter the following information: + + +## Reads (optional) - Path to your FASTQ.GZ formatted read files from libraries that may include: +# - WTA mRNA +# - Targeted mRNA +# - AbSeq +# - Sample Multiplexing +# - VDJ +# You may specify as many R1/R2 read pairs as you want. +Reads: + + - class: File + location: "test/WTALibrary_S1_L001_R1_001.fastq.gz" + + - class: File + location: "test/WTALibrary_S1_L001_R2_001.fastq.gz" + +## Reads_ATAC (optional) - Path to your FASTQ.GZ formatted read files from ATAC-Seq libraries. +## You may specify as many R1/R2/I2 files as you want. 
+Reads_ATAC: + + - class: File + location: "test/ATACLibrary_S2_L001_R1_001.fastq.gz" + + - class: File + location: "test/ATACLibrary_S2_L001_R2_001.fastq.gz" + + - class: File + location: "test/ATACLibrary_S2_L001_I2_001.fastq.gz" + + +## Assay type will be inferred from the provided reference(s) +## Do not provide both Reference_Archive and Targeted_Reference at the same time +## +## Valid reference input combinations: +## WTA Reference_Archive (WTA only) +## WTA Reference_Archive + AbSeq_Reference (WTA + AbSeq) +## WTA Reference_Archive + Supplemental_Reference (WTA + extra transgenes) +## WTA Reference_Archive + AbSeq_Reference + Supplemental_Reference (WTA + AbSeq + extra transgenes) +## WTA+ATAC-Seq Reference_Archive (WTA + ATAC, ATAC only) +## WTA+ATAC-Seq Reference_Archive + Supplemental_Reference (WTA + ATAC + extra transgenes) +## Targeted_Reference (Targeted only) +## Targeted_Reference + AbSeq_Reference (Targeted + AbSeq) +## AbSeq_Reference (AbSeq only) + +## See the BD Rhapsody Sequence Analysis Pipeline User Guide for instructions on how to: +## - Obtain a pre-built Rhapsody Reference file +## - Create a custom Rhapsody Reference file + +## WTA Reference_Archive (required for WTA mRNA assay) - Path to Rhapsody WTA Reference in the tar.gz format. +## +## --Structure of reference archive-- +## BD_Rhapsody_Reference_Files/ # top level folder +## star_index/ # sub-folder containing STAR index +## [files created with STAR --runMode genomeGenerate] +## [GTF for gene-transcript-annotation e.g. "gencode.v43.primary_assembly.annotation.gtf"] +## +## WTA+ATAC-Seq Reference_Archive (required for ATAC-Seq or Multiomic ATAC-Seq (WTA+ATAC-Seq) assays) - Path to Rhapsody WTA+ATAC-Seq Reference in the tar.gz format. +## +## --Structure of reference archive-- +## BD_Rhapsody_Reference_Files/ # top level folder +## star_index/ # sub-folder containing STAR index +## [files created with STAR --runMode genomeGenerate] +## [GTF for gene-transcript-annotation e.g. 
"gencode.v43.primary_assembly.annotation.gtf"] +## +## mitochondrial_contigs.txt # mitochondrial contigs in the reference genome - one contig name per line. e.g. chrMT or chrM, etc. +## +## bwa-mem2_index/ # sub-folder containing bwa-mem2 index +## [files created with bwa-mem2 index] +## +Reference_Archive: + class: File + location: "test/RhapRef_Human_WTA_2023-02.tar.gz" +# location: "test/RhapRef_Human_WTA-ATAC_2023-08.tar.gz" + +## Targeted_Reference (required for Targeted mRNA assay) - Path to the targeted reference file in FASTA format. +#Targeted_Reference: +# - class: File +# location: "test/BD_Rhapsody_Immune_Response_Panel_Hs.fasta" + +## AbSeq_Reference (optional) - Path to the AbSeq reference file in FASTA format. Only needed if BD AbSeq Ab-Oligos are used. +## For putative cell calling using an AbSeq dataset, please provide an AbSeq reference fasta file as the AbSeq_Reference. +#AbSeq_Reference: +# - class: File +# location: "test/AbSeq_reference.fasta" + +## Supplemental_Reference (optional) - Path to the supplemental reference file in FASTA format. Only needed if there are additional transgene sequences to be aligned against in a WTA assay experiment +#Supplemental_Reference: +# - class: File +# location: "test/supplemental_reference.fasta" + +#################################### +## Putative Cell Calling Settings ## +#################################### + +## Putative cell calling dataset (optional) - Specify the dataset to be used for putative cell calling: mRNA, AbSeq, ATAC, mRNA_and_ATAC +## For putative cell calling using an AbSeq dataset, please provide an AbSeq_Reference fasta file above. +## For putative cell calling using an ATAC dataset, please provide a WTA+ATAC-Seq Reference_Archive file above. 
+## The default data for putative cell calling, will be determined the following way: +## If mRNA Reads and ATAC Reads exist: +## Cell_Calling_Data: mRNA_and_ATAC +## If only ATAC Reads exist: +## Cell_Calling_Data: ATAC +## Otherwise: +## Cell_Calling_Data: mRNA +#Cell_Calling_Data: mRNA + +## Putative cell calling bioproduct algorithm (optional) - Specify the bioproduct algorithm to be used for putative cell calling: Basic or Refined +## By default, the Basic algorithm will be used for putative cell calling. +#Cell_Calling_Bioproduct_Algorithm: Basic + +## Putative cell calling ATAC algorithm (optional) - Specify the ATAC-seq algorithm to be used for putative cell calling: Basic or Refined +## By default, the Basic algorithm will be used for putative cell calling. +#Cell_Calling_ATAC_Algorithm: Basic + +## Exact cell count (optional) - Set a specific number (>=1) of cells as putative, based on those with the highest error-corrected read count +#Exact_Cell_Count: 10000 + +## Expected Cell Count (optional) - Guide the basic putative cell calling algorithm by providing an estimate of the number of cells expected. Usually this can be the number of cells loaded into the Rhapsody cartridge. If there are multiple inflection points on the second derivative cumulative curve, this will ensure the one selected is near the expected. +#Expected_Cell_Count: 20000 + + +#################################### +## Intronic Reads Settings ## +#################################### + +## Exclude_Intronic_Reads (optional) +## By default, the flag is false, and reads aligned to exons and introns are considered and represented in molecule counts. When the flag is set to true, intronic reads will be excluded. +## The value can be true or false. 
+#Exclude_Intronic_Reads: true + +####################### +## Multiplex options ## +####################### + +## Sample Tags Version (optional) - If Sample Tag Multiplexing was done, specify the appropriate version: human, mouse, flex, nuclei_includes_mrna, nuclei_atac_only +## If this is an SMK + Nuclei mRNA run or an SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq) run (and not an SMK + ATAC-Seq only run), choose the "nuclei_includes_mrna" option. +## If this is an SMK + ATAC-Seq only run (and not SMK + Multiomic ATAC-Seq (WTA+ATAC-Seq)), choose the "nuclei_atac_only" option. +#Sample_Tags_Version: human + +## Tag_Names (optional) - Specify the tag number followed by '-' and the desired sample name to appear in Sample_Tag_Metrics.csv +# Do not use the special characters: &, (), [], {}, <>, ?, | +#Tag_Names: [4-mySample, 9-myOtherSample, 6-alsoThisSample] + +################ +## VDJ option ## +################ + +## VDJ Version (optional) - If VDJ was done, specify the appropriate option: human, mouse, humanBCR, humanTCR, mouseBCR, mouseTCR +#VDJ_Version: human + +################## +## ATAC options ## +################## + +## Predefined ATAC Peaks - An optional BED file containing pre-established chromatin accessibility peak regions for generating the ATAC cell-by-peak matrix. +#Predefined_ATAC_Peaks: +# class: File +# location: "path/predefined_peaks.bed" + +######################## +## Additional Options ## +######################## + +## Run Name (optional)- Specify a run name to use as the output file base name. Use only letters, numbers, or hyphens. Do not use special characters or spaces. 
+#Run_Name: my-experiment + +## Generate Bam (optional, default: false) - Specify whether to create the BAM file output +#Generate_Bam: true + +## Maximum_Threads (integer, optional, default: [use all cores of CPU]) - Set the maximum number of threads to use in the read processing steps of the pipeline: QualCLAlign, AlignmentAnalysis, VDJ assembly +#Maximum_Threads: 16 + +## Use STARlong (optional, default: "auto" - i.e. autodetects based on read lengths) - Specify if the STARlong aligner should be used instead of STAR. Set to true if the reads are longer than 650bp. +## The value can be true or false. +#Long_Reads: true + +######################## +## Advanced Options ## +######################## +## NOTE: Only change these if you are really sure about what you are doing + +## Modify STAR alignment parameters - Set this parameter to fully override default STAR mapping parameters used in the pipeline. +## For reference this is the default that is used: +## Short Reads: --outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outFilterMultimapScoreRange 0 --clip3pAdapterSeq AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA --seedSearchStartLmax 50 --outFilterMatchNmin 25 --limitOutSJcollapsed 2000000 +## Long Reads: Same as Short Reads + --seedPerReadNmax 10000 +## This applies to fastqs provided in the Reads user input +## Do NOT set any non-mapping related params like --genomeDir, --outSAMtype, --outSAMunmapped, --readFilesIn, --runThreadN, etc. +## We use STAR version 2.7.10b +#Custom_STAR_Params: --alignIntronMax 6000 --outFilterScoreMinOverLread 0.1 --limitOutSJcollapsed 2000000 + +## Modify bwa-mem2 alignment parameters - Set this parameter to fully override bwa-mem2 mapping parameters used in the pipeline +## The pipeline does not specify any custom mapping params to bwa-mem2 so program default values are used +## This applies to fastqs provided in the Reads_ATAC user input +## Do NOT set any non-mapping related params like -C, -t, etc. 
def clean_arg(argument):
    """Add a ``clean_name`` key (the flag name without leading dashes) to an argument.

    The argument dict is modified in place and also returned.
    """
    argument["clean_name"] = argument["name"].lstrip("-")
    return argument


def read_config(path: str) -> dict[str, Any]:
    """Load the component config and flatten its argument groups.

    Args:
        path: Path to the YAML config file.

    Returns:
        The parsed config with an extra ``arguments`` key: a flat list of every
        argument of every group, each carrying a ``clean_name``.
    """
    with open(path, 'r') as f:
        config = yaml.safe_load(f)

    config["arguments"] = [
        clean_arg(arg)
        for grp in config["argument_groups"]
        for arg in grp["arguments"]
    ]

    return config


def strip_margin(text: str) -> str:
    """Remove a leading ``|`` margin (and the whitespace before it) from every line.

    Only a pipe that is the first non-blank character of a line is treated as
    a margin; pipes elsewhere in the text are left untouched.
    """
    return re.sub(r'(?m)^[ \t]*\|', '', text)


def process_params(par: dict[str, Any], config, temp_dir: str) -> dict[str, Any]:
    """Validate and normalise the parameters before the pipeline is started.

    Args:
        par: Parameter values keyed by clean argument name. Modified in place.
        config: Config as returned by ``read_config``.
        temp_dir: Directory used for the default output location.

    Returns:
        The same ``par`` dict, with a default ``output_dir``, a sanitised
        ``run_name`` and absolute paths for all file-typed arguments.

    Raises:
        AssertionError: If neither ``reads`` nor ``reads_atac`` is set.
    """
    # check input parameters
    assert par.get("reads") or par.get("reads_atac"), "Pass at least one set of inputs to --reads or --reads_atac."

    # output to temp dir if output_dir was not passed
    if not par.get("output_dir"):
        par["output_dir"] = os.path.join(temp_dir, "output")

    # checking sample prefix; search the whole string (re.match would only look
    # at the first character) and use the same character class as the substitution
    invalid_chars = r"[^A-Za-z0-9\-]"
    if par.get("run_name") and re.search(invalid_chars, par["run_name"]):
        print("--run_name should only consist of letters, numbers or hyphens. Replacing all '[^A-Za-z0-9]' with '-'.", flush=True)
        par["run_name"] = re.sub(invalid_chars, "-", par["run_name"])

    # make paths absolute
    for argument in config["arguments"]:
        arg_clean_name = argument["clean_name"]
        par_value = par.get(arg_clean_name)
        if not par_value or argument["type"] != "file":
            continue
        if isinstance(par_value, list):
            par[arg_clean_name] = [os.path.abspath(path) for path in par_value]
        else:
            par[arg_clean_name] = os.path.abspath(par_value)

    return par


def _is_set(value: Any) -> bool:
    """Return True unless the value is None, an empty string or an empty list.

    ``False`` and ``0`` count as set so that explicit choices are not dropped.
    """
    return value is not None and value != "" and value != []


def _yaml_scalar(value: Any) -> str:
    """Render a non-file parameter value for the inputs file (booleans as true/false)."""
    if isinstance(value, bool):
        return "true" if value else "false"
    return str(value)


def generate_config(par: dict[str, Any], config) -> str:
    """Build the content of the cwl-runner inputs file.

    Only arguments that have an ``info.config_key`` and a set value are written.
    File arguments become ``class: File`` entries, everything else becomes a
    ``key: value`` line.

    Args:
        par: Parameter values keyed by clean argument name.
        config: Config as returned by ``read_config``.

    Returns:
        The inputs file content as a single string.
    """
    lines = [
        "#!/usr/bin/env cwl-runner",
        "",
        "cwl:tool: rhapsody",
    ]

    for argument in config["arguments"]:
        arg_par_value = par.get(argument["clean_name"])
        arg_info = argument.get("info") or {}  # Note: .info might be None
        config_key = arg_info.get("config_key")
        if not config_key or not _is_set(arg_par_value):
            continue

        if argument["type"] == "file":
            # NOTE(review): the indentation of the File entries is assumed to
            # mirror BD's inputs template; any consistent indentation is valid YAML.
            lines.append(f"{config_key}:")
            if isinstance(arg_par_value, list):
                for file in arg_par_value:
                    lines.append(" - class: File")
                    lines.append(f"   location: \"{file}\"")
            else:
                lines.append("   class: File")
                lines.append(f"   location: \"{arg_par_value}\"")
        else:
            lines.append(f"{config_key}: {_yaml_scalar(arg_par_value)}")

    return "\n".join(lines) + "\n"


def generate_config_file(par: dict[str, Any], config: dict[str, Any], temp_dir: str) -> str:
    """Write the inputs file to ``<temp_dir>/config.yml`` and return its path."""
    config_file = os.path.join(temp_dir, "config.yml")
    config_content = generate_config(par, config)
    with open(config_file, "w") as f:
        f.write(config_content)
    return config_file


def generate_cwl_file(meta: dict[str, Any], dir: str) -> str:
    """Return the path of the CWL pipeline to run.

    When neither ``meta["memory_mb"]`` nor ``meta["cpus"]`` is set, the original
    pipeline file is used as is. Otherwise a copy with rewritten ``ramMin`` /
    ``coresMin`` values is written to ``<dir>/pipeline.cwl``.

    Args:
        meta: Metadata dict providing ``memory_mb`` and ``cpus``.
        dir: Directory in which the patched pipeline file is written.

    Returns:
        Absolute path to the CWL file.
    """
    # create cwl file (if need be)
    # orig_cwl_file=os.path.join(meta["resources_dir"], "rhapsody_pipeline_2.2.1_nodocker.cwl")
    orig_cwl_file = "/var/bd_rhapsody_cwl/v2.2.1/rhapsody_pipeline_2.2.1.cwl"

    if not meta["memory_mb"] and not meta["cpus"]:
        return os.path.abspath(orig_cwl_file)

    # Inject computational requirements into pipeline
    cwl_file = os.path.join(dir, "pipeline.cwl")

    # Read in the file
    with open(orig_cwl_file, 'r') as file:
        cwl_data = file.read()

    # Inject computational requirements into pipeline
    if meta["memory_mb"]:
        total_mb = int(meta["memory_mb"])
        # keep 2gb for OS, but never produce a zero or negative ramMin
        memory = total_mb - 2000 if total_mb > 2000 else total_mb
        cwl_data = re.sub('"ramMin": [^\n]*[^,](,?)\n', f'"ramMin": {memory}\\1\n', cwl_data)
    if meta["cpus"]:
        cwl_data = re.sub('"coresMin": [^\n]*[^,](,?)\n', f'"coresMin": {meta["cpus"]}\\1\n', cwl_data)

    # Write the file out again
    with open(cwl_file, 'w') as file:
        file.write(cwl_data)

    return os.path.abspath(cwl_file)


def copy_outputs(par: dict[str, Any], config: dict[str, Any]):
    """Copy pipeline results from ``par["output_dir"]`` to the requested output paths.

    For every output file argument with a value and an ``info.template``, the
    template placeholders are turned into a glob (``[sample_name]`` becomes the
    run name, ``(assay)`` and ``[number]`` become ``*``) and the matches are copied.

    Raises:
        ValueError: If a required output is missing, or if several files match
            an argument that does not accept multiple values.
    """
    for arg in config["arguments"]:
        par_value = par.get(arg["clean_name"])
        if not (par_value and arg["type"] == "file" and arg.get("direction") == "output"):
            continue

        # example template: '[sample_name]_(assay)_cell_type_experimental.csv'
        template = (arg.get("info") or {}).get("template")  # Note: .info might be None
        if not template:
            continue

        template_glob = template\
            .replace("[sample_name]", par["run_name"])\
            .replace("(assay)", "*")\
            .replace("[number]", "*")
        # sorted so that the index given to multiple outputs is reproducible
        files = sorted(glob.glob(os.path.join(par["output_dir"], template_glob)))
        is_multiple = arg.get("multiple", False)

        if not files:
            if arg.get("required", False):
                raise ValueError(f"Expected output file '{template_glob}' not found.")
            # optional output that the pipeline did not produce: nothing to copy
            continue
        if len(files) > 1 and not is_multiple:
            raise ValueError(f"Expected single output file '{template_glob}', but found multiple.")

        if not is_multiple:
            shutil.copy(files[0], par_value)
        else:
            # replace '*' in par_value with index
            for i, file in enumerate(files):
                shutil.copy(file, par_value.replace("*", str(i)))


def main(par: dict[str, Any], meta: dict[str, Any], temp_dir: str):
    """Run the BD Rhapsody CWL pipeline with cwl-runner and collect its outputs.

    Args:
        par: Parameter values keyed by clean argument name.
        meta: Metadata dict (``config``, ``memory_mb``, ``cpus``, ...).
        temp_dir: Scratch directory for the generated inputs file, the patched
            pipeline and cwl-runner's temporary files.

    Raises:
        subprocess.CalledProcessError: If cwl-runner exits with a non-zero status.
    """
    config = read_config(meta["config"])

    # Preprocess params
    par = process_params(par, config, temp_dir)

    ## Process parameters
    cmd = [
        "cwl-runner",
        "--no-container",
        "--preserve-entire-environment",
        "--outdir", par["output_dir"],
    ]

    if par["parallel"]:
        cmd.append("--parallel")

    if par["timestamps"]:
        cmd.append("--timestamps")

    # Create cwl file (if need be)
    cwl_file = generate_cwl_file(meta, temp_dir)
    cmd.append(cwl_file)

    # Create params file
    config_file = generate_config_file(par, config, temp_dir)
    cmd.append(config_file)

    # keep environment variables but set TMPDIR to temp_dir
    env = dict(os.environ)
    env["TMPDIR"] = temp_dir

    # Create output dir if not exists
    os.makedirs(par["output_dir"], exist_ok=True)

    # Run command
    print("> " + ' '.join(cmd), flush=True)
    _ = subprocess.run(
        cmd,
        cwd=os.path.dirname(config_file),
        env=env,
        check=True
    )

    # Copy outputs
    copy_outputs(par, config)


if __name__ == "__main__":
    with tempfile.TemporaryDirectory(prefix="cwl-bd_rhapsody-", dir=meta["temp_dir"]) as temp_dir:
        main(par, meta, temp_dir)
+import numpy as np +import random +import mudata as md + +## VIASH START +meta = { + "name": "bd_rhapsody_sequence_analysis", + "executable": "target/docker/bd_rhapsody/bd_rhapsody_sequence_analysis/bd_rhapsody_sequence_analysis", + "resources_dir": "src/bd_rhapsody", + "cpus": 8, + "memory_mb": 4096, +} +## VIASH END + +import sys +sys.path.append(meta["resources_dir"]) + +from helpers.rhapsody_cell_label import index_to_sequence + +meta["executable"] = Path(meta["executable"]) +meta["resources_dir"] = Path(meta["resources_dir"]) + +######################################################################################### + +# Generate index +print("> Generate index", flush=True) +# cwl_file = meta["resources_dir"] / "bd_rhapsody_make_reference.cwl" +cwl_file = "/var/bd_rhapsody_cwl/v2.2.1/Extra_Utilities/make_rhap_reference_2.2.1.cwl" +reference_small_gtf = meta["resources_dir"] / "test_data" / "reference_small.gtf" +reference_small_fa = meta["resources_dir"] / "test_data" / "reference_small.fa" +bdabseq_panel_fa = meta["resources_dir"] / "test_data" / "BDAbSeq_ImmuneDiscoveryPanel.fasta" +sampletagsequences_fa = meta["resources_dir"] / "test_data" / "SampleTagSequences_HomoSapiens_ver1.fasta" + +config_file = Path("reference_config.yml") +reference_file = Path("Rhap_reference.tar.gz") + +subprocess.run([ + "cwl-runner", + "--no-container", + "--preserve-entire-environment", + "--outdir", + ".", + str(cwl_file), + "--Genome_fasta", + str(reference_small_fa), + "--Gtf", + str(reference_small_gtf), + "--Extra_STAR_params", + "--genomeSAindexNbases 4" +]) + +######################################################################################### +# Load reference in memory + +from Bio import SeqIO +import gffutils + +# Load FASTA sequence +with open(str(reference_small_fa), "r") as handle: + reference_fasta_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) +with open(str(bdabseq_panel_fa), "r") as handle: + bdabseq_panel_fasta_dict = 
def generate_bd_read_metadata(
    instrument_id: str = "A00226",
    run_id: str = "970",
    flowcell_id: str = "H5FGVMXY",
    lane: int = 1,
    tile: int = 1101,
    x: int = 1000,
    y: int = 1000,
    illumina_flag: str = "1:N:0",
    sample_id: str = "CAGAGAGG",
) -> str:
    """
    Generate a FASTQ metadata line for a BD Rhapsody FASTQ file.

    Args:
        instrument_id: The instrument ID.
        run_id: The run ID.
        flowcell_id: The flowcell ID.
        lane: The lane number.
        tile: The tile number. Between 1101 and 1112 in the used example data.
        x: The x-coordinate. Between 1000 and 32967 in the used example data.
        y: The y-coordinate. Between 1000 and 37059 in the used example data.
        illumina_flag: The Illumina flag. Either 1:N:0 or 2:N:0 in the used example data.
        sample_id: The sample ID.

    Returns:
        The header line, starting with '@' and without a trailing newline.
    """
    # format: @A00226:970:H5FGVDMXY:1:1101:2645:1000 2:N:0:CAGAGAGG
    return f"@{instrument_id}:{run_id}:{flowcell_id}:{lane}:{tile}:{x}:{y} {illumina_flag}:{sample_id}"


def generate_bd_wta_transcript(
    transcript_length: int = 42,
) -> str:
    """
    Generate a WTA transcript for a randomly chosen gene.

    Uses the module-level `reference_gtf_db` (gene and exon annotation) and
    `reference_fasta_dict` (sequences). The exons of the gene are concatenated
    in order of start position and cut off at `transcript_length`.

    Args:
        transcript_length: The maximum length of the returned sequence.

    Returns:
        The transcript sequence. It is shorter than `transcript_length` when the
        total exon length of the selected gene is smaller.
    """

    # Randomly select a gene
    gene = random.choice(list(reference_gtf_db.features_of_type("gene")))

    # Find all exons within the gene
    exons = list(reference_gtf_db.children(gene, featuretype="exon", order_by="start"))

    # Calculate total exon length (GTF coordinates are 1-based and inclusive)
    total_exon_length = sum(exon.end - exon.start + 1 for exon in exons)

    # If total exon length is less than desired transcript length, use it as is
    max_transcript_length = min(total_exon_length, transcript_length)

    # Build the WTA transcript sequence
    sequence = ""
    for exon in exons:
        exon_seq = str(reference_fasta_dict[exon.seqid].seq[exon.start - 1 : exon.end])
        sequence += exon_seq

        # Break if desired length is reached
        if len(sequence) >= max_transcript_length:
            sequence = sequence[:max_transcript_length]
            break

    # add padding if need be
    if len(sequence) < max_transcript_length:
        sequence += "N" * (max_transcript_length - len(sequence))

    return sequence


def generate_bd_wta_read(
    cell_index: int = 0,
    bead_version: str = "EnhV2",
    umi_length: int = 14,
    transcript_length: int = 42,
) -> Tuple[str, str]:
    """
    Generate a BD Rhapsody WTA read pair for a given cell index.

    Args:
        cell_index: The cell index to generate reads for.
        bead_version: The bead version to use for generating the cell label.
        umi_length: The length of the UMI to generate.
        transcript_length: The length of the transcript to generate

    Returns:
        A tuple of two strings, the first string being the R1 read and the second string being the R2 read.
        Each string is a complete four-line FASTQ record ending in a newline.

    More info:

    See structure of reads:
      - https://bd-rhapsody-bioinfo-docs.genomics.bd.com/steps/top_steps.html
      - https://bd-rhapsody-bioinfo-docs.genomics.bd.com/steps/steps_cell_label.html
      - https://scomix.bd.com/hc/en-us/articles/360057714812-All-FAQ
    R1 is Cell Label + UMI + PolyT -> 60 bp
      actually, CLS1 + "GTGA" + CLS2 + "GACA" + CLS3 + UMI
    R2 is the actual read -> 42 bp

    Example R1
    CLS1       Link CLS2      Link CLS3       UMI
    AAAATCCTGT GTGA AACCAAAGT GACA GATAGAGGAG CGCATGTTTATAAC
    """

    # generate metadata
    # integer arithmetic keeps x and y integral, so the header reads ':1009:1000'
    # rather than ':1009.0:1000.0'
    per_row = (32967 - 1000) // 9
    per_col = (37059 - 1000) // 9

    assert 0 <= cell_index < per_row * per_col, f"cell_index must be between 0 and {per_row} * {per_col}"
    x = 1000 + (cell_index % per_row) * 9
    y = 1000 + (cell_index // per_row) * 9
    instrument_id = "A00226"
    run_id = "970"
    flowcell_id = "H5FGVMXY"
    meta_r1 = generate_bd_read_metadata(instrument_id=instrument_id, run_id=run_id, flowcell_id=flowcell_id, x=x, y=y, illumina_flag="1:N:0")
    meta_r2 = generate_bd_read_metadata(instrument_id=instrument_id, run_id=run_id, flowcell_id=flowcell_id, x=x, y=y, illumina_flag="2:N:0")

    # generate r1 (cls1 + link + cls2 + link + cls3 + umi)
    assert cell_index >= 0 and cell_index < 384 * 384 * 384
    cell_label = index_to_sequence(cell_index + 1, bead_version=bead_version)
    # sample random umi
    umi = "".join(random.choices("ACGT", k=umi_length))
    quality_r1 = "I" * (len(cell_label) + len(umi))
    r1 = f"{meta_r1}\n{cell_label}{umi}\n+\n{quality_r1}\n"

    # generate r2 by extracting sequence from fasta and gtf
    wta_transcript = generate_bd_wta_transcript(transcript_length=transcript_length)
    # the transcript can be shorter than requested; the quality string must match it
    quality_r2 = "I" * len(wta_transcript)
    r2 = f"{meta_r2}\n{wta_transcript}\n+\n{quality_r2}\n"

    return r1, r2
transcripts per cell. + + Args: + num_cells: The number of cells to generate + num_reads_per_cell: The number of reads to generate per cell + + Returns: + A tuple of two strings, the first string being the R1 reads and the second string being the R2 reads. + """ + r1_reads = "" + r2_reads = "" + for cell_index in range(num_cells): + for _ in range(num_reads_per_cell): + r1, r2 = generate_bd_wta_read(cell_index) + r1_reads += r1 + r2_reads += r2 + + return r1_reads, r2_reads + +def generate_bd_abc_read( + cell_index: int = 0, + bead_version: str = "EnhV2", + umi_length: int = 14, + transcript_length: int = 72, +) -> Tuple[str, str]: + """ + Generate a BD Rhapsody ABC read pair for a given cell index. + + Args: + cell_index: The cell index to generate reads for. + bead_version: The bead version to use for generating the cell label. + umi_length: The length of the UMI to generate. + transcript_length: The length of the transcript to generate + + Returns: + A tuple of two strings, the first string being the R1 read and the second string being the R2 read. 
+ """ + # generate metadata + per_row = np.floor((32967 - 1000) / 9) + per_col = np.floor((37059 - 1000) / 9) + + assert cell_index >= 0 and cell_index < per_row * per_col, f"cell_index must be between 0 and {per_row} * {per_col}" + x = 1000 + (cell_index % per_row) * 9 + y = 1000 + (cell_index // per_row) * 9 + instrument_id = "A01604" + run_id = "19" + flowcell_id = "HMKLYDRXY" + meta_r1 = generate_bd_read_metadata(instrument_id=instrument_id, run_id=run_id, flowcell_id=flowcell_id, x=x, y=y, illumina_flag="1:N:0") + meta_r2 = generate_bd_read_metadata(instrument_id=instrument_id, run_id=run_id, flowcell_id=flowcell_id, x=x, y=y, illumina_flag="2:N:0") + + # generate r1 (cls1 + link + cls2 + link + cls3 + umi) + assert cell_index >= 0 and cell_index < 384 * 384 * 384 + cell_label = index_to_sequence(cell_index + 1, bead_version=bead_version) + # sample random umi + umi = "".join(random.choices("ACGT", k=umi_length)) + quality_r1 = "I" * (len(cell_label) + len(umi)) + r1 = f"{meta_r1}\n{cell_label}{umi}\n+\n{quality_r1}\n" + + # generate r2 by sampling sequence from bdabseq_panel_fa + abseq_seq = str(random.choice(list(bdabseq_panel_fasta_dict.values())).seq) + abc_suffix = "AAAAAAAAAAAAAAAAAAAAAAA" + abc_data = abseq_seq[:transcript_length - len(abc_suffix) - 1] + abc_prefix = "N" + "".join(random.choices("ACGT", k=transcript_length - len(abc_data) - len(abc_suffix) - 1)) + + abc_transcript = f"{abc_prefix}{abc_data}{abc_suffix}" + + quality_r2 = "#" + "I" * (len(abc_transcript) - 1) + r2 = f"{meta_r2}\n{abc_transcript}\n+\n{quality_r2}\n" + + return r1, r2 + +def generate_bd_abc_fastq_files( + num_cells: int = 100, + num_reads_per_cell: int = 1000, +) -> Tuple[str, str]: + """ + Generate BD Rhapsody ABC FASTQ files for a given number of cells and transcripts per cell. 
+ + Args: + num_cells: The number of cells to generate + num_reads_per_cell: The number of reads to generate per cell + + Returns: + A tuple of two strings, the first string being the R1 reads and the second string being the R2 reads. + """ + r1_reads = "" + r2_reads = "" + for cell_index in range(num_cells): + for _ in range(num_reads_per_cell): + r1, r2 = generate_bd_abc_read(cell_index) + r1_reads += r1 + r2_reads += r2 + + return r1_reads, r2_reads + +def generate_bd_smk_read( + cell_index: int = 0, + bead_version: str = "EnhV2", + umi_length: int = 14, + transcript_length: int = 72, + num_sample_tags: int = 3, +): + """ + Generate a BD Rhapsody SMK read pair for a given cell index. + + Args: + cell_index: The cell index to generate reads for. + bead_version: The bead version to use for generating the cell label. + umi_length: The length of the UMI to generate. + transcript_length: The length of the transcript to generate + num_sample_tags: The number of sample tags to use + + Returns: + A tuple of two strings, the first string being the R1 read and the second string being the R2 read. 
+ """ + # generate metadata + per_row = np.floor((32967 - 1000) / 9) + per_col = np.floor((37059 - 1000) / 9) + + assert cell_index >= 0 and cell_index < per_row * per_col, f"cell_index must be between 0 and {per_row} * {per_col}" + x = 1000 + (cell_index % per_row) * 9 + y = 1000 + (cell_index // per_row) * 9 + instrument_id = "A00226" + run_id = "970" + flowcell_id = "H5FGVDMXY" + + meta_r1 = generate_bd_read_metadata(instrument_id=instrument_id, run_id=run_id, flowcell_id=flowcell_id, x=x, y=y, illumina_flag="1:N:0") + meta_r2 = generate_bd_read_metadata(instrument_id=instrument_id, run_id=run_id, flowcell_id=flowcell_id, x=x, y=y, illumina_flag="2:N:0") + + # generate r1 (cls1 + link + cls2 + link + cls3 + umi) + assert cell_index >= 0 and cell_index < 384 * 384 * 384 + cell_label = index_to_sequence(cell_index + 1, bead_version=bead_version) + # sample random umi + umi = "".join(random.choices("ACGT", k=umi_length)) + quality_r1 = "I" * (len(cell_label) + len(umi)) + r1 = f"{meta_r1}\n{cell_label}{umi}\n+\n{quality_r1}\n" + + # generate r2 by selecting the cell_index %% num_sample_tags sample tags + sampletag_index = cell_index % num_sample_tags + sampletag_seq = str(list(sampletagsequences_fasta_dict.values())[sampletag_index].seq) + smk_data = sampletag_seq[:transcript_length] + smk_suffix = "A" * (transcript_length - len(smk_data)) + quality_r2 = "I" * len(smk_data) + "#" * len(smk_suffix) + r2 = f"{meta_r2}\n{smk_data}{smk_suffix}\n+\n{quality_r2}\n" + + return r1, r2 + +def generate_bd_smk_fastq_files( + num_cells: int = 100, + num_reads_per_cell: int = 1000, + num_sample_tags: int = 3, +) -> Tuple[str, str]: + """ + Generate BD Rhapsody SMK FASTQ files for a given number of cells and transcripts per cell. 
+ + Args: + num_cells: The number of cells to generate + num_reads_per_cell: The number of reads to generate per cell + num_sample_tags: The number of sample tags to use + + Returns: + A tuple of two strings, the first string being the R1 reads and the second string being the R2 reads. + """ + r1_reads = "" + r2_reads = "" + for cell_index in range(num_cells): + for _ in range(num_reads_per_cell): + r1, r2 = generate_bd_smk_read(cell_index, num_sample_tags=num_sample_tags) + r1_reads += r1 + r2_reads += r2 + + return r1_reads, r2_reads + +######################################################################################### + +# Prepare WTA, ABC, and SMK test data +print("> Prepare WTA test data", flush=True) +wta_reads_r1_str, wta_reads_r2_str = generate_bd_wta_fastq_files(num_cells=100, num_reads_per_cell=1000) +with gzip.open("WTAreads_R1.fq.gz", "wt") as f: + f.write(wta_reads_r1_str) +with gzip.open("WTAreads_R2.fq.gz", "wt") as f: + f.write(wta_reads_r2_str) + +print("> Prepare ABC test data", flush=True) +abc_reads_r1_str, abc_reads_r2_str = generate_bd_abc_fastq_files(num_cells=100, num_reads_per_cell=1000) +with gzip.open("ABCreads_R1.fq.gz", "wt") as f: + f.write(abc_reads_r1_str) +with gzip.open("ABCreads_R2.fq.gz", "wt") as f: + f.write(abc_reads_r2_str) + +print("> Prepare SMK test data", flush=True) +smk_reads_r1_str, smk_reads_r2_str = generate_bd_smk_fastq_files(num_cells=100, num_reads_per_cell=1000, num_sample_tags=3) +with gzip.open("SMKreads_R1.fq.gz", "wt") as f: + f.write(smk_reads_r1_str) +with gzip.open("SMKreads_R2.fq.gz", "wt") as f: + f.write(smk_reads_r2_str) + +######################################################################################### + +# Run executable +print(f">> Run {meta['name']}", flush=True) +output_dir = Path("output") +subprocess.run([ + meta['executable'], + "--reads=WTAreads_R1.fq.gz;WTAreads_R2.fq.gz", + f"--reference_archive={reference_file}", + "--reads=ABCreads_R1.fq.gz;ABCreads_R2.fq.gz", + 
f"--abseq_reference={bdabseq_panel_fa}", + "--reads=SMKreads_R1.fq.gz;SMKreads_R2.fq.gz", + "--tag_names=1-Sample1;2-Sample2;3-Sample3", + "--sample_tags_version=human", + "--output_dir=output", + "--exact_cell_count=100", + f"---cpus={meta['cpus'] or 1}", + f"---memory={meta['memory_mb'] or 2048}mb", + # "--output_seurat=seurat.rds", + "--output_mudata=mudata.h5mu", + "--metrics_summary=metrics_summary.csv", + "--pipeline_report=pipeline_report.html", +]) + + +# Check if output exists +print(">> Check if output exists", flush=True) +assert (output_dir / "sample_Bioproduct_Stats.csv").exists() +assert (output_dir / "sample_Metrics_Summary.csv").exists() +assert (output_dir / "sample_Pipeline_Report.html").exists() +assert (output_dir / "sample_RSEC_MolsPerCell_MEX.zip").exists() +assert (output_dir / "sample_RSEC_MolsPerCell_Unfiltered_MEX.zip").exists() +# seurat object is not generated when abc data is added +# assert (output_dir / "sample_Seurat.rds").exists() +assert (output_dir / "sample.h5mu").exists() + +# check individual outputs +# assert Path("seurat.rds").exists() +assert Path("mudata.h5mu").exists() +assert Path("metrics_summary.csv").exists() +assert Path("pipeline_report.html").exists() + +print(">> Check contents of output", flush=True) +data = md.read_h5mu("mudata.h5mu") + +assert data.n_obs == 100, "Number of cells is incorrect" +assert "rna" in data.mod, "RNA data is missing" +assert "prot" in data.mod, "Protein data is missing" + +# check rna data +data_rna = data.mod["rna"] +assert data_rna.n_vars == 1, "Number of genes is incorrect" +assert data_rna.X.sum(axis=1).min() > 950, "Number of reads per cell is incorrect" +# assert data_rna.var.Raw_Reads.sum() == 100000, "Number of reads is incorrect" +assert data_rna.var.Raw_Reads.sum() >= 99990 and data_rna.var.Raw_Reads.sum() <= 100010, \ + f"Expected 100000 RNA reads, got {data_rna.var.Raw_Reads.sum()}" + +# check prot data +data_prot = data.mod["prot"] +assert data_prot.n_vars == 
len(bdabseq_panel_fasta_dict), "Number of proteins is incorrect" +assert data_prot.X.sum(axis=1).min() > 950, "Number of reads per cell is incorrect" +assert data_prot.var.Raw_Reads.sum() >= 99990 and data_prot.var.Raw_Reads.sum() <= 100010, \ + f"Expected 100000 Prot reads, got {data_prot.var.Raw_Reads.sum()}" + + +# check smk data +expected_sample_tags = (["SampleTag01_hs", "SampleTag02_hs", "SampleTag03_hs"] * 34)[:100] +expected_sample_names = (["Sample1", "Sample2", "Sample3"] * 34)[:100] +sample_tags = data_rna.obs["Sample_Tag"] +assert sample_tags.nunique() == 3, "Number of sample tags is incorrect" +assert sample_tags.tolist() == expected_sample_tags, "Sample tags are incorrect" +sample_names = data_rna.obs["Sample_Name"] +assert sample_names.nunique() == 3, "Number of sample names is incorrect" +assert sample_names.tolist() == expected_sample_names, "Sample names are incorrect" + +# TODO: add VDJ, ATAC, and targeted RNA to test + +######################################################################################### + +print("> Test successful", flush=True) diff --git a/src/bd_rhapsody/helpers/rhapsody_cell_label.py b/src/bd_rhapsody/helpers/rhapsody_cell_label.py new file mode 100644 index 00000000..601ce7be --- /dev/null +++ b/src/bd_rhapsody/helpers/rhapsody_cell_label.py @@ -0,0 +1,405 @@ +#!/usr/bin/env python + +# copied from https://bd-rhapsody-public.s3.amazonaws.com/CellLabel/rhapsody_cell_label.py.txt +# documented at https://bd-rhapsody-bioinfo-docs.genomics.bd.com/steps/steps_cell_label.html + +""" +Rhapsody cell label structure +Information on the cell label is captured by the combination of bases in three cell label sections (CLS1, CLS2, CLS3). +Two common linker sequences (L1, L2) separate the three CLS. + +--CLS1---|-L1-|--CLS2---|-L2-|--CL3---|--UMI---|-CaptureSequence- + + +Each cell label section has a whitelist of 96 or 384 possible 9 base sequences. +All the capture oligos from a single bead will have the same cell label. 
+ +---------------- + +V1 beads: + +[A96_cell_key1] + [v1_linker1] + [A96_cell_key2] + [v1_linker2] + [A96_cell_key3] + [8 random base UMI] + [18 base polyT capture] + + +---------------- + +Enhanced beads: +Enhanced beads contain two different capture oligo types, polyT and 5prime. On any one bead, the two different capture oligo types have the same cell label sequences. +Compared to the V1 bead, enhanced beads have shorter linker sequences, longer polyT, and 0-3 diversity insert bases at the beginning of the sequence. +The cell label sections use the same 3 sequence whitelists as V1 beads. + +polyT capture oligo: +[Enh_insert 0-3 bases] + [A96_cell_key1] + [Enh_linker1] + [A96_cell_key2] + [Enh_linker2] + [A96_cell_key3] + [8 random base UMI] + [25 base polyT capture] + +5prime capture oligo: +[Enh_5p_primer] + [A96_cell_key1] + [Enh_5p_linker1] + [A96_cell_key2] + [Enh_5p_linker2] + [A96_cell_key3] + [8 random base UMI] + [Tso_capture_seq] + + +---------------- + +Enhanced V2/V3 beads: +Enhanced V2/V3 beads have the same structure as Enhanced beads, but the cell label sections have been updated with increased diversity + + +polyT capture oligo: +[Enh_insert 0-3 bases] + [B384_cell_key1] + [Enh_linker1] + [B384_cell_key2] + [Enh_linker2] + [B384_cell_key3] + [8 random base UMI] + [25 base polyT capture] + +5prime capture oligo: +[Enh_5p_primer] + [B384_cell_key1] + [Enh_5p_linker1] + [B384_cell_key2] + [Enh_5p_linker2] + [B384_cell_key3] + [8 random base UMI] + [Tso_capture_seq] + + +The only difference between Enh V2 and Enh V3 beads is a different Tso_capture_seq. + +---------------- + +The Rhapsody Sequence Analysis Pipeline will convert each cell label into a single integer representing a unique cell label sequence - which is used in the output files as the 'Cell_index'. 
+This cell index integer is deterministic and derived from the 3 part cell label as follows: + +- Get the 1-based index for each cell label section from the python sets of sequences below +- Apply this equation: + (CLS1index - 1) * 384 * 384 + (CLS2index - 1) * 384 + CLS3index + +(See label_sections_to_index() function below) + + +Example: Enhanced bead sequence: +ACACATTGCAGTGAAGATAGTTCGACACTCAAGACA + +Each part identified: +A CACATTGCA GTGA AGATAGTTC GACA CTCAAGACA +DiversityInsert A96_cell_key1-33 Linker1 A96_cell_key2-78 Linker2 A96_cell_key3-21 + +33-78-21 +(33 - 1) * 384 * 384 + (78 - 1) * 384 + 21 +=4748181 + + +The original sequences of cell label can be determined from the cell index integer by reversing this conversion. +See index_to_label_sections() and index_to_sequence() functions below. + +""" + +v1_linker1 = 'ACTGGCCTGCGA' +v1_linker2 = 'GGTAGCGGTGACA' + +Enh_linker1 = 'GTGA' +Enh_linker2 = 'GACA' + +Enh_5p_primer = "ACAGGAAACTCATGGTGCGT" + +Enh_5p_linker1 = "AATG" +Enh_5p_linker2 = "CCAC" + +Enh_inserts = ["", "A", "GT", "TCA"] + +Tso_capture_seq_Enh_EnhV2 = "TATGCGTAGTAGGTATG" +Tso_capture_seq_EnhV3 = "GTGGAGTCGTGATTATA" + +A96_cell_key1 = ("GTCGCTATA","CTTGTACTA","CTTCACATA","ACACGCCGG","CGGTCCAGG","AATCGAATG","CCTAGTATA","ATTGGCTAA","AAGACATGC","AAGGCGATC", + "GTGTCCTTA","GGATTAGGA","ATGGATCCA","ACATAAGCG","AACTGTATT","ACCTTGCGG","CAGGTGTAG","AGGAGATTA","GCGATTACA","ACCGGATAG", + "CCACTTGGA","AGAGAAGTT","TAAGTTCGA","ACGGATATT","TGGCTCAGA","GAATCTGTA","ACCAAGGAC","AGTATCTGT","CACACACTA","ATTAAGTGC", + "AAGTAACCC","AAATCCTGT","CACATTGCA","GCACTGTCA","ATACTTAGG","GCAATCCGA","ACGCAATCA","GAGTATTAG","GACGGATTA","CAGCTGACA", + "CAACATATT","AACTTCTCC","CTATGAAAT","ATTATTACC","TACCGAGCA","TCTCTTCAA","TAAGCGTTA","GCCTTACAA","AGCACACAG","ACAGTTCCG", + "AGTAAAGCC","CAGTTTCAC","CGTTACTAA","TTGTTCCAA","AGAAGCACT","CAGCAAGAT","CAAACCGCC","CTAACTCGC","AATATTGGG","AGAACTTCC", + 
"CAAAGGCAC","AAGCTCAAC","TCCAGTCGA","AGCCATCAC","AACGAGAAG","CTACAGAAC","AGAGCTATG","GAGGATGGA","TGTACCTTA","ACACACAAA", + "TCAGGAGGA","GAGGTGCTA","ACCCTGACC","ACAAGGATC","ATCCCGGAG","TATGTGGCA","GCTGCCAAT","ATCAGAGCT","TCGAAGTGA","ATAGACGAG", + "AGCCCAATC","CAGAATCGT","ATCTCCACA","ACGAAAGGT","TAGCTTGTA","ACACGAGAT","AACCGCCTC","ATTTAGATG","CAAGCAAGC","CAAAGTGTG", + "GGCAAGCAA","GAGCCAATA","ATGTAATGG","CCTGAGCAA","GAGTACATT","TGCGATCTA" + ) + +A96_cell_key2 = ("TACAGGATA","CACCAGGTA","TGTGAAGAA","GATTCATCA","CACCCAAAG","CACAAAGGC","GTGTGTCGA","CTAGGTCCT","ACAGTGGTA","TCGTTAGCA", + "AGCGACACC","AAGCTACTT","TGTTCTCCA","ACGCGAAGC","CAGAAATCG","ACCAAAATG","AGTGTTGTC","TAGGGATAC","AGGGCTGGT","TCATCCTAA", + "AATCCTGAA","ATCCTAGGA","ACGACCACC","TTCCATTGA","TAGTCTTGA","ACTGTTAGA","ATTCATCGT","ACTTCGAGC","TTGCGTACA","CAGTGCCCG", + "GACACTTAA","AGGAGGCGC","GCCTGTTCA","GTACATCTA","AATCAGTTT","ACGATGAAT","TGACAGACA","ATTAGGCAT","GGAGTCTAA","TAGAACACA", + "AAATAAATA","CCGACAAGA","CACCTACCC","AAGAGTAGA","TCATTGAGA","GACCTTAGA","CAAGACCTA","GGAATGATA","AAACGTACC","ACTATCCTC", + "CCGTATCTA","ACACATGTC","TTGGTATGA","GTGCAGTAA","AGGATTCAA","AGAATGGAG","CTCTCTCAA","GCTAACTCA","ATCAACCGA","ATGAGTTAC", + "ACTTGATGA","ACTTTAACT","TTGGAGGTA","GCCAATGTA","ATCCAACCG","GATGAACTG","CCATGCACA","TAGTGACTA","AAACTGCGC","ATTACCAAG", + "CACTCGAGA","AACTCATTG","CTTGCTTCA","ACCTGAGTC","AGGTTCGCT","AAGGACTAT","CGTTCGGTA","AGATAGTTC","CAATTGATC","GCATGGCTA", + "ACCAGGTGT","AGCTGCCGT","TATAGCCCT","AGAGGACCA","ACAATATGG","CAGCACTTC","CACTTATGT","AGTGAAAGG","AACCCTCGG","AGGCAGCTA", + "AACCAAAGT","GAGTGCGAA","CGCTAAGCA","AATTATAAC","TACTAGTCA","CAACAACGG" + ) + +A96_cell_key3 = ("AAGCCTTCT","ATCATTCTG","CACAAGTAT","ACACCTTAG","GAACGACAA","AGTCTGTAC","AAATTACAG","GGCTACAGA","AATGTATCG","CAAGTAGAA", + "GATCTCTTA","AACAACGCG","GGTGAGTTA","CAGGGAGGG","TCCGTCTTA","TGCATAGTA","ACTTACGAT","TGTATGCGA","GCTCCTTGA","GGCACAACA", + 
"CTCAAGACA","ACGCTGTTG","ATATTGTAA","AAGTTTACG","CAGCCTGGC","CTATTAGCC","CAAACGTGG","AAAGTCATT","GTCTTGGCA","GATCAGCGA", + "ACATTCGGC","AGTAATTAG","TGAAGCCAA","TCTACGACA","CATAACGTT","ATGGGACTC","GATAGAGGA","CTACATGCG","CAACGATCT","GTTAGCCTA", + "AGTTGCATC","AAGGGAACT","ACTACATAT","CTAAGCTTC","ACGAACCAG","TACTTCGGA","AACATCCAT","AGCCTGGTT","CAAGTTTCC","CAGGCATTT", + "ACGTGGGAG","TCTCACGGA","GCAACATTA","ATGGTCCGT","CTATCATGA","CAATACAAG","AAAGAGGCC","GTAGAAGCA","GCTATGGAA","ACTCCAGGG", + "ACAAGTGCA","GATGGTCCA","TCCTCAATA","AATAAACAA","CTGTACGGA","CTAGATAGA","AGCTATGTG","AAATGGAGG","AGCCGCAAG","ACAGTAAAC", + "AACGTGTGA","ACTGAATTC","AAGGGTCAG","TGTCTATCA","TCAGATTCA","CACGATCCG","AACAGAAAC","CATGAATGA","CGTACTACG","TTCAGCTCA", + "AAGGCCGCA","GGTTGGACA","CGTCTAGGT","AATTCGGCG","CAACCTCCA","CAATAGGGT","ACAGGCTCC","ACAACTAGT","AGTTGTTCT","AATTACCGG", + "ACAAACTTT","TCTCGGTTA","ACTAGACCG","ACTCATACG","ATCGAGTCT","CATAGGTCA" + ) + +B384_cell_key1 = ("TGTGTTCGC","TGTGGCGCC","TGTCTAGCG","TGGTTGTCC","TGGTTCCTC","TGGTGTGCT","TGGCGACCG","TGCTGTGGC","TGCTGGCAC","TGCTCTTCC", + "TGCCTCACC","TGCCATTAT","TGATGTCTC","TGATGGCCT","TGATGCTTG","TGAAGGACC","TCTGTCTCC","TCTGATTAT","TCTGAGGTT","TCTCGTTCT", + "TCTCATCCG","TCCTGGATT","TCAGCATTC","TCACGCCTT","TATGTGCAC","TATGCGGCC","TATGACGAG","TATCTCGTG","TATATGACC","TAGGCTGTG", + "TACTGCGTT","TACGTGTCC","TAATCACAT","GTTGTGTTG","GTTGTGGCT","GTTGTCTGT","GTTGTCGAG","GTTGTCCTC","GTTGTATCC","GTTGGTTCT", + "GTTGGCGTT","GTTGGAGCG","GTTGCTGCC","GTTGCGCAT","GTTGCAGGT","GTTGCACTG","GTTGATGAT","GTTGATACG","GTTGAAGTC","GTTCTGTGC", + "GTTCTCTCG","GTTCTATAT","GTTCGTATG","GTTCGGCCT","GTTCGCGGC","GTTCGATTC","GTTCCGGTT","GTTCCGACG","GTTCACGCT","GTTATCACC", + "GTTAGTCCG","GTTAGGTGT","GTTAGAGAC","GTTAGACTT","GTTACCTCT","GTTAATTCC","GTTAAGCGC","GTGTTGCTT","GTGTTCGGT","GTGTTCCAG", + "GTGTTCATC","GTGTCACAC","GTGTCAAGT","GTGTACTGC","GTGGTTAGT","GTGGTACCG","GTGGCGATC","GTGCTTCTG","GTGCGTTCC","GTGCGGTAT", + 
"GTGCGCCTT","GTGCGAACT","GTGCAGCCG","GTGCAATTG","GTGCAAGGC","GTCTTGCGC","GTCTGGCCG","GTCTGAGGC","GTCTCAGAT","GTCTCAACC", + "GTCTATCGT","GTCGGTGTG","GTCGGAATC","GTCGCTCCG","GTCCTCGCC","GTCCTACCT","GTCCGCTTG","GTCCATTCT","GTCCAATAC","GTCATGTAT", + + "GTCAGTGGT","GTCAGATAG","GTATTAACT","GTATCAGTC","GTATAGCCT","GTATACTTG","GTATAAGGT","GTAGCATCG","GTACCGTCC","GTACACCTC", + "GTAAGTGCC","GTAACAGAG","GGTTGTGTC","GGTTGGCTG","GGTTGACGC","GGTTCGTCG","GGTTCAGTT","GGTTATATT","GGTTAATAC","GGTGTACGT", + "GGTGCCGCT","GGTGCATGC","GGTCGTTGC","GGTCGAGGT","GGTAGGCAC","GGTAGCTTG","GGTACATAG","GGTAATCTG","GGCTTGGCC","GGCTTCACG", + "GGCTTATGT","GGCTTACTC","GGCTGTCTT","GGCTCTGTG","GGCTCCGGT","GGCTCACCT","GGCGTTGAG","GGCGTGTAC","GGCGTGCTG","GGCGTATCG", + "GGCGCTCGT","GGCGCTACC","GGCGAGCCT","GGCGAGATC","GGCGACTTG","GGCCTCTTC","GGCCTACAG","GGCCAGCGC","GGCCAACTT","GGCATTCCT", + "GGCATCCGC","GGCATAACC","GGCAACGAT","GGATGTCCG","GGATGAGAG","GGATCTGGC","GGATCCATG","GGATAGGTT","GGAGTCGTG","GGAGAAGGC", + "GGACTCCTT","GGACTAGTC","GGACCGTTG","GGAATTAGT","GGAATCTCT","GGAATCGAC","GGAAGCCTC","GCTTGTAGC","GCTTGACCG","GCTTCGGAC", + "GCTTCACAT","GCTTAGTCT","GCTGGATAT","GCTGGAACC","GCTGCGATG","GCTGATCAG","GCTGAGCGT","GCTCTTGTC","GCTCTCCTG","GCTCGGTCC", + "GCTCCAATT","GCTATTCGC","GCTATGAGT","GCTAGTGTT","GCTAGGATC","GCTAGCACT","GCTACGTAT","GCTAACCTT","GCGTTCCGC","GCGTGTGCC", + "GCGTGCATT","GCGTCGGTT","GCGTATGTG","GCGTATACT","GCGGTTCAC","GCGGTCTTG","GCGGCGTCG","GCGGCACCT","GCGCTGGAC","GCGCTCTCC", + + "GCGCGGCAG","GCGCGATAC","GCGCCGACC","GCGAGCGAG","GCGAGAGGT","GCGAATTAC","GCCTTGCAT","GCCTGCGCT","GCCTAACTG","GCCGTCCGT", + "GCCGCTGTC","GCCATGCCG","GCCAGCTAT","GCCAACCAG","GCATGGTTG","GCATCGACG","GCAGGCTAG","GCAGGACGC","GCAGCCATC","GCAGATACC", + "GCAGACGTT","GCACTATGT","GCACACGAG","GATTGTCAT","GATTGGTAG","GATTGCACC","GATTCTACT","GATTCGCTT","GATTAGGCC","GATTACGGT", + "GATGTTGGC","GATGTTATG","GATGGCCAG","GATCGTTCG","GATCGGAGC","GATCGCCTC","GATCCTCTG","GATCCAGCG","GATACACGC","GAGTTACCT", + 
"GAGTCGTAT","GAGTCGCCG","GAGGTGTAG","GAGGCATTG","GAGCGGACG","GAGCCTGAG","GAGATCTGT","GAGATAATT","GAGACGGCT","GACTTCGTG", + "GACTGTTCT","GACTCTTAG","GACCGCATT","GAATTGAGC","GAATATTGC","GAAGGCTCT","GAAGAGACT","GAACTGCCG","GAACGCGTG","CTTGTGTAT", + "CTTGTGCGC","CTTGTCATG","CTTGGTCTT","CTTGGTACC","CTTGGATGT","CTTGCTCAC","CTTGCAATC","CTTGAGGCC","CTTGACGGT","CTTCTGATC", + "CTTCTCGTT","CTTCTAGGC","CTTCGTTAG","CTTATGTCC","CTTATGCTT","CTTATATAG","CTTAGGTTG","CTTAGGAGC","CTTACTTAT","CTGTTCTCG", + "CTGTGCCTC","CTGTCGCAT","CTGTCGAGC","CTGTAGCTG","CTGTACGTT","CTGCTTGCC","CTGCGTAGT","CTGCACACC","CTGATGGAT","CTGAGTCAT", + "CTGACGCCG","CTGAACGAG","CTCTTGTAG","CTCTTAGTT","CTCTTACCG","CTCTGCACC","CTCTCGTCC","CTCGTATTG","CTCGACTAT","CTCCTGACG", + + "CTCACTAGC","CTATACGGC","CGTTCGCTC","CGTTCACCG","CGTATAGTT","CGGTGTTCC","CGGTGTCAG","CGGTCCTGC","CGGCGACTC","CGGCACGGT", + "CGGATAGCC","CGGAGAGAT","CGCTAATAG","CGCGTTGGC","CGCGCAGAG","CGCACTGCC","CCTTGTCTC","CCTTGGCGT","CCTTCTGAG","CCTTCTCCT", + "CCTTCGACC","CCTTACTTG","CCTGTTCGT","CCTGTATGC","CCTCGGCCG","CCGTTAATT","CCATGTGCG","CCAGTGGTT","CCAGGCATT","CCAGGATCC", + "CCAGCGTTG","CATTCCGAT","CATTATACC","CATGTTGAG","ATTGCGTGT","ATTGCGGAC","ATTGCGCCG","ATTGACTTG","ATTCGGCTG","ATTCGCGAG", + "ATTCCAAGT","ATTATCTTC","ATTACTGTT","ATTACACTC","ATGTTCTAT","ATGTTACGC","ATGTGTATC","ATGTGGCAG","ATGTCTGTG","ATGGTGCAT", + "ATGCTTACT","ATGCTGTCC","ATGCTCGGC","ATGAGGTTC","ATGAGAGTG","ATCTTGGCT","ATCTGTGCG","ATCGGTTCC","ATCATGCTC","ATCATCACT", + "ATATCTTAT","ATAGGCGCC","AGTTGGTAT","AGTTGAGCC","AGTGCGACC","AGGTGCTAC","AGGCTTGCG","AGGCCTTCC","AGGCACCTT","AGGAATATG", + "AGCGGCCAG","AGCCTGGTC","AGCCTGACT","AGCAATCCG","AGAGATGTT","AGAGAATTC","ACTCGCTTG","ACTCGACCT","ACGTACACC","ACGGATGGT", + "ACCAGTCTG","ACATTCGGC","ACATGAGGT","ACACTAATT" + ) + +B384_cell_key2 = ("TTGTGTTGT","TTGTGGTAG","TTGTGCGGA","TTGTCTGTT","TTGTCTAAG","TTGTCATAT","TTGTCACGA","TTGTATGAA","TTGTACAGT","TTGGTTAAT", + 
"TTGGTGCAA","TTGGTCGAG","TTGGTATTA","TTGGCACAG","TTGGATACA","TTGGAAGTG","TTGCGGTTA","TTGCCATTG","TTGCACGCG","TTGCAAGGT", + "TTGATGTAT","TTGATAATT","TTGAGACGT","TTGACTACT","TTGACCGAA","TTCTGGTCT","TTCTGCACA","TTCTCCTTA","TTCTCCGCT","TTCTAGGTA", + "TTCTAATCG","TTCGTCGTA","TTCGTAGAT","TTCGGCTTG","TTCGGAATA","TTCGCCAGA","TTCGATTGT","TTCGATCAG","TTCCTCGGT","TTCCGGCAG", + "TTCCGCATT","TTCCAATTA","TTCATTGAA","TTCATGCTG","TTCAGGAGT","TTCACTATA","TTCAACTCT","TTCAACGTG","TTATGCGTT","TTATGATTG", + "TTATCCTGT","TTATCCGAG","TTATATTAT","TTAGGCGCG","TTACTGGAA","TTACTAGTT","TTACGTGGT","TTACGATAT","TTACCTAGA","TTACATGAG", + "TTACAGCGT","TTACACGGA","TTACACACT","TTAATCAGT","TTAATAGGA","TTAAGTGTG","TTAACCTTG","TTAACACAA","TGTTCACTT","TGTTCAAGA", + "TGTTAAGTG","TGTGTTATG","TGTGTCCAA","TGTGGAGCG","TGTCAGTTA","TGTCAGAAG","TGGTTAGTT","TGGTTACAA","TGGCGTTAT","TGGCGCCAA", + "TGGAGTCTT","TGCGTATTG","TGATAGAGA","TGAGGTATT","TGAGAATCT","TCTTGGTAA","TCTTCATAG","TCTGTCCTT","TCTGGAATT","TCTACCGCG", + "TCGTTCGAA","TCGTCAGTG","TCGACGAGA","TCATGGCTT","TCACACTTA","TATTCCGAA","TATTATGGT","TATGCTATT","TATCAAGGA","TAGTTCAAT", + + "TAGCTGCTT","TAGAGGAAG","TACCTGTTA","TACACCTGT","GTTGTGCGT","GTTGGCTAT","GTTGCCAAG","GTTGACCTT","GTTCTGCTA","GTTCTGAAT", + "GTTCTATCA","GTTCGCGTG","GTTCCTTAT","GTTAGCAGT","GTTACTGTG","GTTACTCAA","GTTAAGAGA","GTTAACTTA","GTGTCGGCA","GTGTCCATT", + "GTGCTTGAG","GTGCTCGTT","GTGCTCACA","GTGCCTGGA","GTCTTGTCG","GTCTTGATT","GTCTTCCGT","GTCTTAAGA","GTCTCATCT","GTCTACGAG", + "GTCGTTGCT","GTCGTGTTA","GTCGGTAAT","GTCGGATGT","GTCGAGCTG","GTCCGGACT","GTCCAACAT","GTCAGACGA","GTCAGAATT","GTCACTCTT", + "GTCAAGGAA","GTATGTCTT","GTATGTACA","GTATCGGTT","GTATATGTA","GTATACAAT","GTAGTTAAG","GTAGTCGAT","GTAGCCTTA","GTAGATACT", + "GTACGATTA","GTACAGTCT","GTAATTCGT","GCTTGGCAG","GCTTGCTTG","GCTTGAGGA","GCTTCATTA","GCTTATGCG","GCTGTGTAG","GCTGTCATG", + "GCTGGTTGT","GCTGGACTG","GCTGCCTAA","GCTGATATT","GCTCTTAGT","GCTCTATTG","GCTCGCCGT","GCTCCGCTG","GCTATTCTG","GCTATACGA", + 
"GCTACTAAG","GCTACATGT","GCTAACTCT","GCGTTGTAA","GCGTTCTCT","GCGTGCGTA","GCGTCTTGA","GCGTCCGAT","GCGTAAGAG","GCGCTTACG", + "GCGCGGATT","GCGCCATAT","GCGCATGAA","GCGATCAAT","GCGAGCCTT","GCGAGATTG","GCGAGAACA","GCCTTGGTA","GCCTTCTAG","GCCTTCACA", + "GCCTGAGTG","GCCTCACGT","GCCGGCGAA","GCCGCACAA","GCCATGCTT","GCCATATAT","GCCAATTCG","GCATTCGTT","GCATGATGT","GCAGTTGGA", + + "GCAGTGTCT","GCACTTGTG","GCAATCTGT","GCAACACTT","GATTGTATT","GATTGCGAG","GATTCCAGT","GATTCATAT","GATTATCAG","GATTAGGTT", + "GATGTTGCG","GATGGATCT","GATGCTGAT","GATGCCTTG","GATCTCCTT","GATCGCTTA","GATATTGAA","GATATTACT","GAGTGTTAT","GAGCTCAGT", + "GAGCGTGCT","GAGCGTCGA","GAGCGGTTG","GAGCGACTT","GAGCCGAAT","GAGATAGAT","GAGACCTAT","GACGGTCGT","GACGCAGGT","GACGATATG", + "GACCTATCT","GAATTAGGA","GAATCAGCT","GAAGTTCAT","GAAGTGGTT","GAAGTATTG","GAAGGCATT","GAACGCTGT","CTTGTCCAG","CTTGGATTG", + "CTTGCTGAA","CTTGCCGTG","CTTGATTCT","CTTCTGTCG","CTTCGGCGT","CTTATGAGT","CTTACCGAT","CTGTTAGGT","CTGTCGTCT","CTGTATAAT", + "CTGGCTCAT","CTGGATGCG","CTGCGTGTG","CTGCGCGGT","CTGCCGATT","CTGCATTGT","CTGATTAAG","CTGAGATAT","CTGACCTGT","CTCGTATCT", + "CTCGGCAAG","CTCGCAATT","CTCCTGCTT","CTCCTAAGT","CTCCGGATG","CTCCGAGCG","CTCACAGGT","CTATTCTAT","CTATTAGTG","CTATGAATT", + "CTACATATT","CGTGGCATT","CGTCTTAAT","CGTCTGGTT","CGTCACTGT","CGTAGGTCT","CGGTTCGAG","CGGTTCATT","CGGTGCTCT","CGGTAATTG", + "CGGCCTGAT","CGGATATAG","CGGAATATT","CGCTCCAAT","CGCGTTCGT","CGCAGGTTG","CGAGGATGT","CGAGCTGTT","CGACGGCTT","CCTTGTGTG", + "CCTGTCTCA","CCTGACTAT","CCTACCTTG","CCGTAGATT","CCGGCTGGT","CATCGGACG","CATCGATAA","CATCCTTCT","CAGTTCTGT","CAGTGCCAG", + + "CAGGCACTG","CAGCCTCTT","CACTTATAT","CACTGGTCG","CACTGCATG","CACGCGTTG","CACGATGTT","CACCATCTG","CACAGGCGT","ATTGTACAA", + "ATTGGTATG","ATTGCTAAT","ATTGCATAG","ATTGCAGTT","ATTCTGCAG","ATTCTACGT","ATTCGGATT","ATTCCGTTG","ATTCATCAA","ATTCAAGAG", + "ATTAGCCTT","ATTAATATT","ATGTTAGAG","ATGTTAACT","ATGTAGTCG","ATGGTGTAG","ATGGATTAT","ATCTTGAAG","ATCTGATAT","ATCTCAGAA", + 
"ATCGCTCAA","ATCGCGTCG","ATCCATGGT","ATCATGAGA","ATCATAGTT","ATCAGCGAG","ATCACCATT","ATAGTAATT","ATAGCTGTG","ATACTCTCG", + "ATACCTCAT","AGTTGCGCG","AGTTGAATT","AGTTATGAT","AGTGTCCGT","AGTGGCTTG","AGTGCTTCT","AGTATCATT","AGTACACAA","AGGTATGCG", + "AGGTATAGT","AGGCTACTT","AGGCCAGGT","AGGAGCGAT","AGCTTATAG","AGCTCTAGA","AGCGTGTAT","AGCGTCACA","AGCCTTCAT","AGCCTGTCG", + "AGCCTCGAG","AGCACTGAA","AGATGTACG","AGAGTTAAT","AGACCTCTG","ACTTCTATA","ACTGTCGAG","ACTGTATGT","ACTCTGTAA","ACTCGCGAA", + "ACTAGATCT","ACTAACGTT","ACGTTACTG","ACGTGGAAT","ACGGACTCT","ACGCCTAAT","ACGCCGTTA","ACGACGTGT","ACCTCGCAT","ACCATCATA", + "ACATATATT","ACAGGCACA","ACACCTGAG","ACACATTCT" + ) + +B384_cell_key3 = ("TTGTGGCTG","TTGTGGAGT","TTGTGCGAC","TTGTCTTCA","TTGTAAGAT","TTGGTTCTG","TTGGTGCGT","TTGGTCTAC","TTGGTAACT","TTGGCGTGC", + "TTGGATTAG","TTGGAGACG","TTGGAATCA","TTGCGGCGA","TTGCGCTCG","TTGCCTTAC","TTGCCGGAT","TTGCATGCT","TTGCACGTC","TTGCACCAT", + "TTGAACCTG","TTCTCGCGT","TTCTCAACT","TTCTACTCA","TTCGTCCAT","TTCGGATAC","TTCGGACGT","TTCGCAATC","TTCCGGTGC","TTCCGACTG", + "TTCATTATG","TTCATGGAT","TTCAGCGCA","TTCACCTCG","TTCAAGCAG","TTCAACTAC","TTATGCCAG","TTATGCATC","TTATCGTAC","TTATACCTA", + "TTATAATAG","TTATAAGTC","TTAGTTAGC","TTAGCTCAT","TTAGCACTA","TTAGATATG","TTACTACGA","TTACCGTCA","TTACAGAGC","TTAATTGCA", + "TTAACAGAT","TGTTGGCTA","TGTTGATGA","TGTTAAGCT","TGTGGCCGA","TGTGCTAGC","TGTGCGTCA","TGTCGCAGT","TGTCGAGCA","TGTACAACG", + "TGGTTCCGA","TGGTTCACT","TGGTCAAGT","TGGCTTGTA","TGGCTGTCG","TGGCGTATG","TGGCGCGCT","TGGATGTAC","TGGACTTGC","TGGAATACT", + "TGCTAGCGA","TGCGTTGCT","TGCGGTCTG","TGCGCTTAG","TGCGCGACG","TGCCTGCAT","TGCCTAGAC","TGCACGAGT","TGAGTGTGC","TGAGGCTCG", + "TCTTCCGTC","TCTTATAGT","TCTTACCAT","TCTGTTGTC","TCTGTTACT","TCTGGCTAG","TCTCAGATC","TCTAGTTGA","TCTAGTACG","TCGTACTAC", + "TCGGTGTAG","TCGGCTGCT","TCGCTACTG","TCGATCACG","TCGAGGCAT","TCCGGCGTC","TCCGGAGCT","TCCGCTCGT","TCCGAGTAC","TCCATTCAT", + + 
"TCCATGGTC","TCCAAGTCG","TCATTACGT","TCATGCACT","TCAGGTTGC","TCAGACCGT","TCACTCAGT","TCAAGCTCA","TATTGCGCA","TATTCGGCT", + "TATTCCAGC","TATTCATCA","TATGTTCAG","TATGGTATG","TATGCAAGT","TATCTGGTC","TATCTGACT","TATCCAGAT","TATCAGTCG","TATCACGCT", + "TAGGCGCGA","TAGGCACAT","TAGGATCGT","TAGCATTGC","TAGAGTTAC","TAGACTGAT","TACTTGTCG","TACGTCCGA","TACCGTACT","TACCGCGAT", + "TACCAGGAC","TACAGAAGT","TAAGTGCAT","TAAGCTACT","GTTGACCGA","GTTCTCGAC","GTTCCTGCT","GTTATGATG","GTGCTTGCA","GTGCCGCGT", + "GTATTGCTG","GTATTCCGA","GTATTAAGC","GTATGACGT","GTAGTTGTC","GTAGTACAT","GTAGCTCGA","GGTTGCTCA","GGTTGAGTA","GGTTAACGT", + "GGTGTGGCA","GGTCTTCAG","GGTCGTCTA","GGTCGGCGT","GGTCCGACT","GGTCATGTC","GGTCACATG","GGTAGTGCT","GGTAGCGTC","GGTACCAGT", + "GGTAAGGAT","GGCTTGTGC","GGCTTGACT","GGCTTACGA","GGCTGTAGT","GGCTGGCAG","GGCTCCATC","GGCGTGGAT","GGCGTAATC","GGCGCAAGT", + "GGCGAGTAG","GGCGACCGT","GGCCTGTCA","GGCCATTGC","GGCACTCTG","GGATGTCAT","GGAGTAACT","GGAGAACGA","GGACTGGCT","GGACGTTCA", + "GGAACGTGC","GCTGTCCAT","GCTGGTTCA","GCTGCAACT","GCTCGTTAC","GCTATAGAT","GCTAGTCGT","GCTACCATG","GCGTTCTGA","GCGTGTTAG", + "GCGGTATCG","GCGGAGCAT","GCGCGGTGC","GCGCCTAGT","GCGCCGGCT","GCCTTCATG","GCCATACTG","GCATGTTGA","GCATGCTAC","GCAGTATAC", + + "GCAGGTACT","GCAGCGCGT","GCACCTCAT","GCAATTCGA","GATTGCCGT","GATGAACAT","GATCTTCGA","GATCTGCAT","GAGTGGCAT","GAGTCGGAC", + "GAGTATGAT","GAGGCGAGT","GAGGCAACG","GAGCGCACT","GAATAGGCT","ATTGTCACT","ATTGTATCA","ATTGGTCAG","ATTGGCGAT","ATTGATCGT", + "ATTCGTAGT","ATTCATACG","ATTCAGGAC","ATTACTTCA","ATTAATTAG","ATTAAGCAT","ATGTCTCTA","ATGTAGCGT","ATGGCATAC","ATGGAGATC", + "ATGGACTCG","ATGGAACGA","ATGCTTCAT","ATGCTCGCT","ATGCGACGT","ATGCCGTAG","ATGAGTTCG","ATGACTATC","ATGACCGAC","ATCTTATGC", + "ATCTTACTA","ATCTATCAG","ATCGTGTAC","ATCGTCTGA","ATCGGCATG","ATCGCGAGC","ATCGCAACG","ATCGATGCT","ATCGAATAG","ATCCTTCTG", + "ATCCTGCGT","ATCCGCACT","ATCCATTAC","ATCCAAGCA","ATCAGATCA","ATCACACAT","ATCAACGTC","ATCAACCGA","ATATTGAGT","ATATTCGTC", + 
"ATATTACAG","ATATCTTGA","ATATCGCAT","ATATCAATC","ATAGTCCTG","ATAGGTCTA","ATAGCTGAC","ATAGCGGTA","AGTTCGCTG","AGTTACAGC", + "AGTTAACTA","AGTGCAATC","AGTCTGGTA","AGTCTGAGC","AGTCTACAT","AGTCGAACT","AGTCCATCG","AGTCATTCA","AGTATCCAG","AGTAGACTG", + "AGTAATCGA","AGTAAGTGC","AGGTTGGCT","AGGTTCTAG","AGGTGTTCA","AGGTGCCAT","AGGTCTGAT","AGGTCGTAC","AGGTCAGCA","AGGCTTATC", + "AGGCTATGA","AGGCCGACG","AGGCCAAGC","AGGCAGGTC","AGGCAAGAT","AGGAGCAGT","AGGACCGCT","AGGAATTAC","AGCTTGGAC","AGCTTAAGT", + + "AGCTACACG","AGCGTTACG","AGCGGTGCA","AGCGGAGTC","AGCGGACGA","AGCGCGCTA","AGCGATAGC","AGCGACTCA","AGCCTCTAC","AGCCGTCGT", + "AGCATGATC","AGCACTTCG","AGCACGGCA","AGATTCTGA","AGATTAGAT","AGATGATAG","AGATATGTA","AGATACCGT","AGAGTGCGT","AGAGCCGAT", + "AGACTCACT","ACTTGCCTA","ACTTGAGCA","ACTTCTAGC","ACTTCGACT","ACTTAGTAC","ACTGTTGAT","ACTGTAACG","ACTGGTATC","ACTGACGTC", + "ACTGAAGCT","ACTCTGATG","ACTCCTGAC","ACTCCGCTA","ACTCAACTG","ACTATTGCA","ACTAGGCAG","ACTACGCGT","ACTAATACT","ACGTTCGTA", + "ACGTGTGCT","ACGTGTATG","ACGTGGAGC","ACGTCTTCG","ACGTCAGTC","ACGGTCTCA","ACGGTCCGT","ACGGTACAG","ACGGCGCTG","ACGCTGCGA", + "ACGCGTGTA","ACGCGCCAG","ACGATGTCG","ACGATGGAT","ACGATCTAC","ACGAGCTGA","ACGAGCATC","ACGAATCGT","ACGAACGCA","ACCTTGTAG", + "ACCTGTTGC","ACCTGTCAT","ACCTCGATC","ACCTAGGTA","ACCTACTGA","ACCTAATCG","ACCGTAGCA","ACCGGTAGT","ACCGGCTAC","ACCGCTTCA", + "ACATTGTGC","ACATTCTCG","ACATGGCTG","ACATGACGA","ACATATGAT","ACATATACG","ACAGCGTAC","ACACTTGCT","ACACTATCA","ACACGCATG", + "ACACCAGTA","ACACCAACT","ACACATAGT","ACACACCTA" + ) + + +def label_sections_to_index(label): + """ + Return the cell_index integer based on input 3 part cell label string + + """ + + cl1, cl2, cl3 = [int(n) for n in label.split('-')] + return (cl1 - 1) * 384 * 384 + (cl2 - 1) * 384 + (cl3 - 1) + 1 + + +# print(label_sections_to_index('1-1-1')) +# print(label_sections_to_index('33-78-21')) +# print(label_sections_to_index('43-12-77')) +# print(label_sections_to_index('96-96-96')) +# 
print(label_sections_to_index('135-43-344')) +# print(label_sections_to_index('384-384-384')) +# print('-') + +#---------------------------------- + + +def index_to_label_sections(index): + + zerobased = int(index) - 1 + + cl1 = (int((zerobased) / 384 / 384) % 384) + 1 + cl2 = (int((zerobased) / 384) % 384) + 1 + cl3 = (zerobased % 384) + 1 + + return f'{cl1}-{cl2}-{cl3}' + + +# print(index_to_label_sections(1)) +# print(index_to_label_sections(4748181)) +# print(index_to_label_sections(6197453)) +# print(index_to_label_sections(14044896)) +# print(index_to_label_sections(19775576)) +# print(index_to_label_sections(56623104)) +# print('-') +#---------------------------------- + + +def index_to_sequence(index, bead_version): + + zerobased = int(index) - 1 + + cl1 = (int((zerobased) / 384 / 384) % 384) + 1 + cl2 = (int((zerobased) / 384) % 384) + 1 + cl3 = (zerobased % 384) + 1 + + if bead_version == 'v1': + cls1_sequence = A96_cell_key1[cl1-1] + cls2_sequence = A96_cell_key2[cl2-1] + cls3_sequence = A96_cell_key3[cl3-1] + + return f'{cls1_sequence}{v1_linker1}{cls2_sequence}{v1_linker2}{cls3_sequence}' + + elif bead_version == 'Enh': + + diversityInsert = '' + + if 1 <= cl1 <= 24: + diversityInsert = '' + elif 25 <= cl1 <= 48: + diversityInsert = 'A' + elif 49 <= cl1 <= 72: + diversityInsert = 'GT' + else: # 73 <= cl1 <= 96: + diversityInsert = 'TCA' + + cls1_sequence = A96_cell_key1[cl1-1] + cls2_sequence = A96_cell_key2[cl2-1] + cls3_sequence = A96_cell_key3[cl3-1] + + return f'{diversityInsert}{cls1_sequence}{Enh_linker1}{cls2_sequence}{Enh_linker2}{cls3_sequence}' + + elif bead_version == 'EnhV2': + + diversityInsert = '' + subIndex = ((cl1-1) % 96) + 1 + + if 1 <= subIndex <= 24: + diversityInsert = '' + elif 25 <= subIndex <= 48: + diversityInsert = 'A' + elif 49 <= subIndex <= 72: + diversityInsert = 'GT' + else: # 73 <= subIndex <= 96: + diversityInsert = 'TCA' + + cls1_sequence = B384_cell_key1[cl1-1] + cls2_sequence = B384_cell_key2[cl2-1] + cls3_sequence 
= B384_cell_key3[cl3-1] + + return f'{diversityInsert}{cls1_sequence}{Enh_linker1}{cls2_sequence}{Enh_linker2}{cls3_sequence}' + + +# print(index_to_sequence(4748181, 'Enh')) +# print(index_to_sequence(52923177, 'EnhV2')) + +#---------------------------------- + + +def create_cell_index_fasta_V1(): + with open('Rhapsody_cellBarcodeV1_IndexToSequence.fasta', 'w') as f: + for cl1 in range(1, 96+1): + for cl2 in range(1, 96+1): + for cl3 in range(1, 96+1): + index = label_sections_to_index(f'{cl1}-{cl2}-{cl3}') + sequence = index_to_sequence(index, 'v1') + f.write(f'>{index}\n') + f.write(f'{sequence}\n') + +#create_cell_index_fasta_V1() + + +def create_cell_index_fasta_Enh(): + with open('Rhapsody_cellBarcodeEnh_IndexToSequence.fasta', 'w') as f: + for cl1 in range(1, 96+1): + for cl2 in range(1, 96+1): + for cl3 in range(1, 96+1): + index = label_sections_to_index(f'{cl1}-{cl2}-{cl3}') + sequence = index_to_sequence(index, 'Enh') + f.write(f'>{index}\n') + f.write(f'{sequence}\n') + +#create_cell_index_fasta_Enh() + +def create_cell_index_fasta_EnhV2(): + with open('Rhapsody_cellBarcodeEnhV2_IndexToSequence.fasta', 'w') as f: + for cl1 in range(1, 384+1): + for cl2 in range(1, 384+1): + for cl3 in range(1, 384+1): + index = label_sections_to_index(f'{cl1}-{cl2}-{cl3}') + sequence = index_to_sequence(index, 'EnhV2') + f.write(f'>{index}\n') + f.write(f'{sequence}\n') + +#create_cell_index_fasta_EnhV2() diff --git a/src/bd_rhapsody/test_data/BDAbSeq_ImmuneDiscoveryPanel.fasta b/src/bd_rhapsody/test_data/BDAbSeq_ImmuneDiscoveryPanel.fasta new file mode 100644 index 00000000..930add4a --- /dev/null +++ b/src/bd_rhapsody/test_data/BDAbSeq_ImmuneDiscoveryPanel.fasta @@ -0,0 +1,60 @@ +>CD11c:B-LY6|ITGAX|AHS0056|pAbO Catalog_940024 +ATGCGTTGCGAGAGATATGCGTAGGTTGCTGATTGG +>CD14:MPHIP9|CD14|AHS0037|pAbO Catalog_940005 +TGGCCCGTGGTAGCGCAATGTGAGATCGTAATAAGT +>CXCR5|CXCR5|AHS0039|pAbO Catalog_940042 +AGGAAGGTCGATTGTATAACGCGGCATTGTAACGGC +>CD19:SJ25C1|CD19|AHS0030|pAbO 
Catalog_940004 +TAGTAATGTGTTCGTAGCCGGTAATAATCTTCGTGG +>CD25:2A3|IL2RA|AHS0026|pAbO Catalog_940009 +AGTTGTATGGGTTAGCCGAGAGTAGTGCGTATGATT +>CD27:M-T271|CD27|AHS0025|pAbO Catalog_940018 +TGTCCGGTTTAGCGAATTGGGTTGAGTCACGTAGGT +>CD278|ICOS|AHS0012|pAbO Catalog_940043 +ATAGTCCGCCGTAATCGTTGTGTCGCTGAAAGGGTT +>CD279:EH12-1|PDCD1|AHS0014|pAbO Catalog_940015 +ATGGTAGTATCACGACGTAGTAGGGTAATTGGCAGT +>CD3:UCHT1|CD3E|AHS0231|pAbO Catalog_940307 +AGCTAGGTGTTATCGGCAAGTTGTACGGTGAAGTCG +>GITR|TNFRSF18|AHS0104|pAbO Catalog_940096 +TCTGTGTGTCGGGTTGAATCGTAGTGAGTTAGCGTG +>Tim3|HAVCR2|AHS0016|pAbO Catalog_940066 +TAGGTAGTAGTCCCGTATATCCGATCCGTGTTGTTT +>CD4:SK3|CD4|AHS0032|pAbO Catalog_940001 +TCGGTGTTATGAGTAGGTCGTCGTGCGGTTTGATGT +>CD45RA:HI100|PTPRC|AHS0009|pAbO Catalog_940011 +AAGCGATTGCGAAGGGTTAGTCAGTACGTTATGTTG +>CD56:NCAM16.2|NCAM1|AHS0019|pAbO Catalog_940007 +AGAGGTTGAGTCGTAATAATAATCGGAAGGCGTTGG +>CD62L:DREG-56|SELL|AHS0049|pAbO Catalog_940041 +ATGGTAAATATGGGCGAATGCGGGTTGTGCTAAAGT +>CCR7|CCR7|AHS0273|pAbO Catalog_940394 +AATGTGTGATCGGCAAAGGGTTCTCGGGTTAATATG +>CXCR6|CXCR6|AHS0148|pAbO Catalog_940234 +GTGGTTGGTTATTCGGACGGTTCTATTGTGAGCGCT +>CD127|IL7R|AHS0028|pAbO Catalog_940012 +AGTTATTAGGCTCGTAGGTATGTTTAGGTTATCGCG +>CD134:ACT35|TNFRSF4|AHS0013|pAbO Catalog_940060 +GGTGTTGGTAAGACGGACGGAGTAGATATTCGAGGT +>CD28:L293|CD28|AHS0138|pAbO Catalog_940226 +TTGTTGAGGATACGATGAAGCGGTTTAAGGGTGTGG +>CD272|BTLA|AHS0052|pAbO Catalog_940105 +GTAGGTTGATAGTCGGCGATAGTGCGGTTGAAAGCT +>CD8:SK1|CD8A|AHS0228|pAbO Catalog_940305 +AGGACATAGAGTAGGACGAGGTAGGCTTAAATTGCT +>HLA-DR|CD74|AHS0035|pAbO Catalog_940010 +TGTTGGTTATTCGTTAGTGCATCCGTTTGGGCGTGG +>CD16:3G8|FCGR3A|AHS0053|pAbO Catalog_940006 +TAAATCTAATCGCGGTAACATAACGGTGGGTAAGGT +>CD183|CXCR3|AHS0031|pAbO Catalog_940030 +AAAGTGTTGGCGTTATGTGTTCGTTAGCGGTGTGGG +>CD196|CCR6|AHS0034|pAbO Catalog_940033 +ACGTGTTATGGTGTTGTTCGAATTGTGGTAGTCAGT +>CD137|TNFRSF9|AHS0003|pAbO Catalog_940055 +TGACAAGCAACGAGCGATACGAAAGGCGAAATTAGT +>CD161:HP-3G10|KLRB1|AHS0205|pAbO Catalog_940283 
+TTTAGGACGATTAGTTGTGCGGCATAGGAGGTGTTC +>IgM|IGHM|AHS0198|pAbO Catalog_940276 +TTTGGAGGGTAGCTAGTTGCAGTTCGTGGTCGTTTC +>IgD|IGHD|AHS0058|pAbO Catalog_940026 +TGAGGGATGTATAGCGAGAATTGCGACCGTAGACTT diff --git a/src/bd_rhapsody/test_data/SampleTagSequences_HomoSapiens_ver1.fasta b/src/bd_rhapsody/test_data/SampleTagSequences_HomoSapiens_ver1.fasta new file mode 100644 index 00000000..3d5a42fa --- /dev/null +++ b/src/bd_rhapsody/test_data/SampleTagSequences_HomoSapiens_ver1.fasta @@ -0,0 +1,24 @@ +>SampleTag01_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGATTCAAGGGCAGCCGCGTCACGATTGGATACGACTGTTGGACCGG +>SampleTag02_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGTGGATGGGATAAGTGCGTGATGGACCGAAGGGACCTCGTGGCCGG +>SampleTag03_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCGGCTCGTGCTGCGTCGTCTCAAGTCCAGAAACTCCGTGTATCCT +>SampleTag04_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGATTGGGAGGCTTTCGTACCGCTGCCGCCACCAGGTGATACCCGCT +>SampleTag05_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCTCCCTGGTGTTCAATACCCGATGTGGTGGGCAGAATGTGGCTGG +>SampleTag06_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGTTACCCGCAGGAAGACGTATACCCCTCGTGCCAGGCGACCAATGC +>SampleTag07_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGTGTCTACGTCGGACCGCAAGAAGTGAGTCAGAGGCTGCACGCTGT +>SampleTag08_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCCCCACCAGGTTGCTTTGTCGGACGAGCCCGCACAGCGCTAGGAT +>SampleTag09_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGGTGATCCGCGCAGGCACACATACCGACTCAGATGGGTTGTCCAGG +>SampleTag10_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGGCAGCCGGCGTCGTACGAGGCACAGCGGAGACTAGATGAGGCCCC +>SampleTag11_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCGCGTCCAATTTCCGAAGCCCCGCCCTAGGAGTTCCCCTGCGTGC +>SampleTag12_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGGCCCATTCATTGCACCCGCCAGTGATCGACCCTAGTGGAGCTAAG diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa b/src/bd_rhapsody/test_data/reference_small.fa similarity index 100% rename from src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa rename to src/bd_rhapsody/test_data/reference_small.fa diff --git 
a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf b/src/bd_rhapsody/test_data/reference_small.gtf similarity index 100% rename from src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf rename to src/bd_rhapsody/test_data/reference_small.gtf diff --git a/src/bd_rhapsody/test_data/script.sh b/src/bd_rhapsody/test_data/script.sh new file mode 100644 index 00000000..f8db0313 --- /dev/null +++ b/src/bd_rhapsody/test_data/script.sh @@ -0,0 +1,141 @@ +#!/bin/bash + +TMP_DIR=/tmp/bd_rhapsody_make_reference +OUT_DIR=src/bd_rhapsody/test_data + +# check if seqkit is installed +if ! command -v seqkit &> /dev/null; then + echo "seqkit could not be found" + exit 1 +fi + +# create temporary directory and clean up on exit +mkdir -p $TMP_DIR +function clean_up { + rm -rf "$TMP_DIR" +} +trap clean_up EXIT + +# fetch reference +ORIG_FA=$TMP_DIR/reference.fa.gz +if [ ! -f $ORIG_FA ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz \ + -O $ORIG_FA +fi + +ORIG_GTF=$TMP_DIR/reference.gtf.gz +if [ ! 
-f $ORIG_GTF ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz \ + -O $ORIG_GTF +fi + +# create small reference +START=30000 +END=31500 +CHR=chr1 + +# subset to small region +seqkit grep -r -p "^$CHR\$" "$ORIG_FA" | \ + seqkit subseq -r "$START:$END" > $OUT_DIR/reference_small.fa + +zcat "$ORIG_GTF" | \ + awk -v FS='\t' -v OFS='\t' " + \$1 == \"$CHR\" && \$4 >= $START && \$5 <= $END { + \$4 = \$4 - $START + 1; + \$5 = \$5 - $START + 1; + print; + }" > $OUT_DIR/reference_small.gtf + +# download bdabseq immunediscoverypanel fasta +# note: was contained in http://bd-rhapsody-public.s3.amazonaws.com/Rhapsody-Demo-Data-Inputs/12WTA-ABC-SMK-EB-5kJRT.tar +cat > $OUT_DIR/BDAbSeq_ImmuneDiscoveryPanel.fasta <CD11c:B-LY6|ITGAX|AHS0056|pAbO Catalog_940024 +ATGCGTTGCGAGAGATATGCGTAGGTTGCTGATTGG +>CD14:MPHIP9|CD14|AHS0037|pAbO Catalog_940005 +TGGCCCGTGGTAGCGCAATGTGAGATCGTAATAAGT +>CXCR5|CXCR5|AHS0039|pAbO Catalog_940042 +AGGAAGGTCGATTGTATAACGCGGCATTGTAACGGC +>CD19:SJ25C1|CD19|AHS0030|pAbO Catalog_940004 +TAGTAATGTGTTCGTAGCCGGTAATAATCTTCGTGG +>CD25:2A3|IL2RA|AHS0026|pAbO Catalog_940009 +AGTTGTATGGGTTAGCCGAGAGTAGTGCGTATGATT +>CD27:M-T271|CD27|AHS0025|pAbO Catalog_940018 +TGTCCGGTTTAGCGAATTGGGTTGAGTCACGTAGGT +>CD278|ICOS|AHS0012|pAbO Catalog_940043 +ATAGTCCGCCGTAATCGTTGTGTCGCTGAAAGGGTT +>CD279:EH12-1|PDCD1|AHS0014|pAbO Catalog_940015 +ATGGTAGTATCACGACGTAGTAGGGTAATTGGCAGT +>CD3:UCHT1|CD3E|AHS0231|pAbO Catalog_940307 +AGCTAGGTGTTATCGGCAAGTTGTACGGTGAAGTCG +>GITR|TNFRSF18|AHS0104|pAbO Catalog_940096 +TCTGTGTGTCGGGTTGAATCGTAGTGAGTTAGCGTG +>Tim3|HAVCR2|AHS0016|pAbO Catalog_940066 +TAGGTAGTAGTCCCGTATATCCGATCCGTGTTGTTT +>CD4:SK3|CD4|AHS0032|pAbO Catalog_940001 +TCGGTGTTATGAGTAGGTCGTCGTGCGGTTTGATGT +>CD45RA:HI100|PTPRC|AHS0009|pAbO Catalog_940011 +AAGCGATTGCGAAGGGTTAGTCAGTACGTTATGTTG +>CD56:NCAM16.2|NCAM1|AHS0019|pAbO Catalog_940007 +AGAGGTTGAGTCGTAATAATAATCGGAAGGCGTTGG +>CD62L:DREG-56|SELL|AHS0049|pAbO Catalog_940041 
+ATGGTAAATATGGGCGAATGCGGGTTGTGCTAAAGT +>CCR7|CCR7|AHS0273|pAbO Catalog_940394 +AATGTGTGATCGGCAAAGGGTTCTCGGGTTAATATG +>CXCR6|CXCR6|AHS0148|pAbO Catalog_940234 +GTGGTTGGTTATTCGGACGGTTCTATTGTGAGCGCT +>CD127|IL7R|AHS0028|pAbO Catalog_940012 +AGTTATTAGGCTCGTAGGTATGTTTAGGTTATCGCG +>CD134:ACT35|TNFRSF4|AHS0013|pAbO Catalog_940060 +GGTGTTGGTAAGACGGACGGAGTAGATATTCGAGGT +>CD28:L293|CD28|AHS0138|pAbO Catalog_940226 +TTGTTGAGGATACGATGAAGCGGTTTAAGGGTGTGG +>CD272|BTLA|AHS0052|pAbO Catalog_940105 +GTAGGTTGATAGTCGGCGATAGTGCGGTTGAAAGCT +>CD8:SK1|CD8A|AHS0228|pAbO Catalog_940305 +AGGACATAGAGTAGGACGAGGTAGGCTTAAATTGCT +>HLA-DR|CD74|AHS0035|pAbO Catalog_940010 +TGTTGGTTATTCGTTAGTGCATCCGTTTGGGCGTGG +>CD16:3G8|FCGR3A|AHS0053|pAbO Catalog_940006 +TAAATCTAATCGCGGTAACATAACGGTGGGTAAGGT +>CD183|CXCR3|AHS0031|pAbO Catalog_940030 +AAAGTGTTGGCGTTATGTGTTCGTTAGCGGTGTGGG +>CD196|CCR6|AHS0034|pAbO Catalog_940033 +ACGTGTTATGGTGTTGTTCGAATTGTGGTAGTCAGT +>CD137|TNFRSF9|AHS0003|pAbO Catalog_940055 +TGACAAGCAACGAGCGATACGAAAGGCGAAATTAGT +>CD161:HP-3G10|KLRB1|AHS0205|pAbO Catalog_940283 +TTTAGGACGATTAGTTGTGCGGCATAGGAGGTGTTC +>IgM|IGHM|AHS0198|pAbO Catalog_940276 +TTTGGAGGGTAGCTAGTTGCAGTTCGTGGTCGTTTC +>IgD|IGHD|AHS0058|pAbO Catalog_940026 +TGAGGGATGTATAGCGAGAATTGCGACCGTAGACTT +EOF + +# this was obtained by running the command: +# docker run bdgenomics/rhapsody:2.2.1 cat /rhapsody/control_files/SampleTagSequences_HomoSapiens_ver1.fasta +cat > $OUT_DIR/SampleTagSequences_HomoSapiens_ver1.fasta <SampleTag01_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGATTCAAGGGCAGCCGCGTCACGATTGGATACGACTGTTGGACCGG +>SampleTag02_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGTGGATGGGATAAGTGCGTGATGGACCGAAGGGACCTCGTGGCCGG +>SampleTag03_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCGGCTCGTGCTGCGTCGTCTCAAGTCCAGAAACTCCGTGTATCCT +>SampleTag04_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGATTGGGAGGCTTTCGTACCGCTGCCGCCACCAGGTGATACCCGCT +>SampleTag05_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCTCCCTGGTGTTCAATACCCGATGTGGTGGGCAGAATGTGGCTGG +>SampleTag06_hs|stAbO 
+GTTGTCAAGATGCTACCGTTCAGAGTTACCCGCAGGAAGACGTATACCCCTCGTGCCAGGCGACCAATGC +>SampleTag07_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGTGTCTACGTCGGACCGCAAGAAGTGAGTCAGAGGCTGCACGCTGT +>SampleTag08_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCCCCACCAGGTTGCTTTGTCGGACGAGCCCGCACAGCGCTAGGAT +>SampleTag09_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGGTGATCCGCGCAGGCACACATACCGACTCAGATGGGTTGTCCAGG +>SampleTag10_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGGCAGCCGGCGTCGTACGAGGCACAGCGGAGACTAGATGAGGCCCC +>SampleTag11_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGCGCGTCCAATTTCCGAAGCCCCGCCCTAGGAGTTCCCCTGCGTGC +>SampleTag12_hs|stAbO +GTTGTCAAGATGCTACCGTTCAGAGGCCCATTCATTGCACCCGCCAGTGATCGACCCTAGTGGAGCTAAG +EOF