From 7d99065ecf66e6bc42b03f8ffcfcfc95ef2d2b72 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Wed, 17 Jul 2024 17:46:44 +0200 Subject: [PATCH] `bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (#75) * `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline * add missing metadata * remove unicode * trigger * process comments * add authors * Apply suggestions from code review Co-authored-by: Dorien <41797896+dorien-er@users.noreply.github.com> --------- Co-authored-by: Dorien <41797896+dorien-er@users.noreply.github.com> --- CHANGELOG.md | 6 + src/_authors/robrecht_cannoodt.yaml | 14 ++ src/_authors/weiwei_schultz.yaml | 5 + .../config.vsh.yaml | 143 ++++++++++++++++ .../bd_rhapsody_make_reference/help.txt | 66 +++++++ .../make_rhap_reference_2.2.1_nodocker.cwl | 115 +++++++++++++ .../bd_rhapsody_make_reference/script.py | 161 ++++++++++++++++++ .../bd_rhapsody_make_reference/test.sh | 68 ++++++++ .../test_data/reference_small.fa | 27 +++ .../test_data/reference_small.gtf | 8 + .../test_data/script.sh | 47 +++++ 11 files changed, 660 insertions(+) create mode 100644 src/_authors/robrecht_cannoodt.yaml create mode 100644 src/_authors/weiwei_schultz.yaml create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/help.txt create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/script.py create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/test.sh create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 80b8b9f3..9cfacdbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # biobox x.x.x +## NEW FEATURES + +* `bd_rhapsody`: + + - `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (PR #75). + ## BUG FIXES * `pear`: fix component not exiting with the correct exitcode when PEAR fails. diff --git a/src/_authors/robrecht_cannoodt.yaml b/src/_authors/robrecht_cannoodt.yaml new file mode 100644 index 00000000..d7c0f283 --- /dev/null +++ b/src/_authors/robrecht_cannoodt.yaml @@ -0,0 +1,14 @@ +name: Robrecht Cannoodt +info: + links: + email: robrecht@data-intuitive.com + github: rcannood + orcid: "0000-0003-3641-729X" + linkedin: robrechtcannoodt + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Science Engineer + - name: Open Problems + href: https://openproblems.bio + role: Core Member \ No newline at end of file diff --git a/src/_authors/weiwei_schultz.yaml b/src/_authors/weiwei_schultz.yaml new file mode 100644 index 00000000..324f9378 --- /dev/null +++ b/src/_authors/weiwei_schultz.yaml @@ -0,0 +1,5 @@ +name: Weiwei Schultz +info: + organizations: + - name: Janssen R&D US + role: Associate Director Data Sciences \ No newline at end of file diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml b/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml new file mode 100644 index 00000000..e596bf06 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml @@ -0,0 +1,143 @@ +name: bd_rhapsody_make_reference +namespace: bd_rhapsody +description: | + The Reference Files Generator creates an archive containing Genome Index + and Transcriptome annotation files needed for the BD Rhapsody Sequencing + Analysis Pipeline. The app takes as input one or more FASTA and GTF files + and produces a compressed archive in the form of a tar.gz file. The + archive contains: + + - STAR index + - Filtered GTF file +keywords: [genome, reference, index, align] +links: + repository: https://bitbucket.org/CRSwDev/cwl/src/master/v2.2.1/Extra_Utilities/ + documentation: https://bd-rhapsody-bioinfo-docs.genomics.bd.com/resources/extra_utilities.html#make-rhapsody-reference +license: Unknown +authors: + - __merge__: /src/_authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/_authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - type: file + name: --genome_fasta + required: true + description: Reference genome file in FASTA or FASTA.GZ format. The BD Rhapsody Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. + example: genome_sequence.fa.gz + multiple: true + info: + config_key: Genome_fasta + - type: file + name: --gtf + required: true + description: | + File path to the transcript annotation files in GTF or GTF.GZ format. The Sequence Analysis Pipeline requires the 'gene_name' or + 'gene_id' attribute to be set on each gene and exon feature. Gene and exon feature lines must have the same attribute, and exons + must have a corresponding gene with the same value. For TCR/BCR assays, the TCR or BCR gene segments must have the 'gene_type' or + 'gene_biotype' attribute set, and the value should begin with 'TR' or 'IG', respectively. + example: transcriptome_annotation.gtf.gz + multiple: true + info: + config_key: Gtf + - type: file + name: --extra_sequences + description: | + File path to additional sequences in FASTA format to use when building the STAR index. (e.g. transgenes or CRISPR guide barcodes). + GTF lines for these sequences will be automatically generated and combined with the main GTF. + required: false + multiple: true + info: + config_key: Extra_sequences + - name: Outputs + arguments: + - type: file + name: --reference_archive + direction: output + required: true + description: | + A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an + input in the BD Rhapsody Sequencing Analysis Pipeline. + example: star_index.tar.gz + - name: Arguments + arguments: + - type: string + name: --mitochondrial_contigs + description: | + Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are + identified as 'nuclear fragments' in the ATACseq analysis pipeline. + required: false + multiple: true + default: [chrM, chrMT, M, MT] + info: + config_key: Mitochondrial_contigs + - type: boolean_true + name: --filtering_off + description: | + By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features + having the following attribute values are kept: + + - protein_coding + - lncRNA (lincRNA and antisense for Gencode < v31/M22/Ensembl97) + - IG_LV_gene + - IG_V_gene + - IG_V_pseudogene + - IG_D_gene + - IG_J_gene + - IG_J_pseudogene + - IG_C_gene + - IG_C_pseudogene + - TR_V_gene + - TR_V_pseudogene + - TR_D_gene + - TR_J_gene + - TR_J_pseudogene + - TR_C_gene + + If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True. + info: + config_key: Filtering_off + - type: boolean_true + name: --wta_only_index + description: Build a WTA only index, otherwise builds a WTA + ATAC index. + info: + config_key: Wta_Only + - type: string + name: --extra_star_params + description: Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line. + example: --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11 + required: false + info: + config_key: Extra_STAR_params + +resources: + - type: python_script + path: script.py + - path: make_rhap_reference_2.2.1_nodocker.cwl + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +requirements: + commands: [ "cwl-runner" ] + +engines: + - type: docker + image: bdgenomics/rhapsody:2.2.1 + setup: + - type: apt + packages: [procps] + - type: python + packages: [cwlref-runner, cwl-runner] + - type: docker + run: | + echo "bdgenomics/rhapsody: 2.2.1" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/help.txt b/src/bd_rhapsody/bd_rhapsody_make_reference/help.txt new file mode 100644 index 00000000..cd038b25 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/help.txt @@ -0,0 +1,66 @@ +```bash +cwl-runner src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl --help +``` + +usage: src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl + [-h] [--Archive_prefix ARCHIVE_PREFIX] + [--Extra_STAR_params EXTRA_STAR_PARAMS] + [--Extra_sequences EXTRA_SEQUENCES] [--Filtering_off] --Genome_fasta + GENOME_FASTA --Gtf GTF [--Maximum_threads MAXIMUM_THREADS] + [--Mitochondrial_Contigs MITOCHONDRIAL_CONTIGS] [--WTA_Only] + [job_order] + +The Reference Files Generator creates an archive containing Genome Index and +Transcriptome annotation files needed for the BD Rhapsodyâ„¢ Sequencing +Analysis Pipeline. The app takes as input one or more FASTA and GTF files and +produces a compressed archive in the form of a tar.gz file. The archive +contains:\n - STAR index\n - Filtered GTF file + +positional arguments: + job_order Job input json file + +options: + -h, --help show this help message and exit + --Archive_prefix ARCHIVE_PREFIX + A prefix for naming the compressed archive file + containing the Reference genome index and annotation + files. The default value is constructed based on the + input Reference files. + --Extra_STAR_params EXTRA_STAR_PARAMS + Additional parameters to pass to STAR when building + the genome index. Specify exactly like how you would + on the command line. Example: --limitGenomeGenerateRAM + 48000 --genomeSAindexNbases 11 + --Extra_sequences EXTRA_SEQUENCES + Additional sequences in FASTA format to use when + building the STAR index. (E.g. phiX genome) + --Filtering_off By default the input Transcript Annotation files are + filtered based on the gene_type/gene_biotype + attribute. Only features having the following + attribute values are are kept: - protein_coding - + lncRNA (lincRNA and antisense for Gencode < + v31/M22/Ensembl97) - IG_LV_gene - IG_V_gene - + IG_V_pseudogene - IG_D_gene - IG_J_gene - + IG_J_pseudogene - IG_C_gene - IG_C_pseudogene - + TR_V_gene - TR_V_pseudogene - TR_D_gene - TR_J_gene - + TR_J_pseudogene - TR_C_gene If you have already pre- + filtered the input Annotation files and/or wish to + turn-off the filtering, please set this option to + True. + --Genome_fasta GENOME_FASTA + Reference genome file in FASTA format. The BD + Rhapsodyâ„¢ Sequencing Analysis Pipeline uses GRCh38 + for Human and GRCm39 for Mouse. + --Gtf GTF Transcript annotation files in GTF format. The BD + Rhapsodyâ„¢ Sequencing Analysis Pipeline uses Gencode + v42 for Human and M31 for Mouse. + --Maximum_threads MAXIMUM_THREADS + The maximum number of threads to use in the pipeline. + By default, all available cores are used. + --Mitochondrial_Contigs MITOCHONDRIAL_CONTIGS + Names of the Mitochondrial contigs in the provided + Reference Genome. Fragments originating from contigs + other than these are identified as 'nuclear fragments' + in the ATACseq analysis pipeline. + --WTA_Only Build a WTA only index, otherwise builds a WTA + ATAC + index. diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl b/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl new file mode 100644 index 00000000..fead2c02 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl @@ -0,0 +1,115 @@ +requirements: + InlineJavascriptRequirement: {} +class: CommandLineTool +label: Reference Files Generator for BD Rhapsodyâ„¢ Sequencing Analysis Pipeline +cwlVersion: v1.2 +doc: >- + The Reference Files Generator creates an archive containing Genome Index and Transcriptome annotation files needed for the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. The app takes as input one or more FASTA and GTF files and produces a compressed archive in the form of a tar.gz file. The archive contains:\n - STAR index\n - Filtered GTF file + + +baseCommand: run_reference_generator.sh +inputs: + Genome_fasta: + type: File[] + label: Reference Genome + doc: |- + Reference genome file in FASTA format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. + inputBinding: + prefix: --reference-genome + shellQuote: false + Gtf: + type: File[] + label: Transcript Annotations + doc: |- + Transcript annotation files in GTF format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses Gencode v42 for Human and M31 for Mouse. + inputBinding: + prefix: --gtf + shellQuote: false + Extra_sequences: + type: File[]? + label: Extra Sequences + doc: |- + Additional sequences in FASTA format to use when building the STAR index. (E.g. phiX genome) + inputBinding: + prefix: --extra-sequences + shellQuote: false + Mitochondrial_Contigs: + type: string[]? + default: ["chrM", "chrMT", "M", "MT"] + label: Mitochondrial Contig Names + doc: |- + Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are identified as 'nuclear fragments' in the ATACseq analysis pipeline. + inputBinding: + prefix: --mitochondrial-contigs + shellQuote: false + Filtering_off: + type: boolean? + label: Turn off filtering + doc: |- + By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features having the following attribute values are are kept: + - protein_coding + - lncRNA (lincRNA and antisense for Gencode < v31/M22/Ensembl97) + - IG_LV_gene + - IG_V_gene + - IG_V_pseudogene + - IG_D_gene + - IG_J_gene + - IG_J_pseudogene + - IG_C_gene + - IG_C_pseudogene + - TR_V_gene + - TR_V_pseudogene + - TR_D_gene + - TR_J_gene + - TR_J_pseudogene + - TR_C_gene + If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True. + inputBinding: + prefix: --filtering-off + shellQuote: false + WTA_Only: + type: boolean? + label: WTA only index + doc: Build a WTA only index, otherwise builds a WTA + ATAC index. + inputBinding: + prefix: --wta-only-index + shellQuote: false + Archive_prefix: + type: string? + label: Archive Prefix + doc: |- + A prefix for naming the compressed archive file containing the Reference genome index and annotation files. The default value is constructed based on the input Reference files. + inputBinding: + prefix: --archive-prefix + shellQuote: false + Extra_STAR_params: + type: string? + label: Extra STAR Params + doc: |- + Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line. + Example: + --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11 + inputBinding: + prefix: --extra-star-params + shellQuote: true + + Maximum_threads: + type: int? + label: Maximum Number of Threads + doc: |- + The maximum number of threads to use in the pipeline. By default, all available cores are used. + inputBinding: + prefix: --maximum-threads + shellQuote: false + +outputs: + + Archive: + type: File + doc: |- + A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an input in the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. + id: Reference_Archive + label: Reference Files Archive + outputBinding: + glob: '*.tar.gz' + diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/script.py b/src/bd_rhapsody/bd_rhapsody_make_reference/script.py new file mode 100644 index 00000000..ca635508 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/script.py @@ -0,0 +1,161 @@ +import os +import re +import subprocess +import tempfile +from typing import Any +import yaml +import shutil + +## VIASH START +par = { + "genome_fasta": [], + "gtf": [], + "extra_sequences": [], + "mitochondrial_contigs": ["chrM", "chrMT", "M", "MT"], + "filtering_off": False, + "wta_only_index": False, + "extra_star_params": None, + "reference_archive": "output.tar.gz", +} +meta = { + "config": "target/nextflow/reference/build_bdrhap_2_reference/.config.vsh.yaml", + "resources_dir": os.path.abspath("src/reference/build_bdrhap_2_reference"), + "temp_dir": os.getenv("VIASH_TEMP"), + "memory_mb": None, + "cpus": None +} +## VIASH END + +def clean_arg(argument): + argument["clean_name"] = re.sub("^-*", "", argument["name"]) + return argument + +def read_config(path: str) -> dict[str, Any]: + with open(path, "r") as f: + config = yaml.safe_load(f) + + config["all_arguments"] = [ + clean_arg(arg) + for grp in config["argument_groups"] + for arg in grp["arguments"] + ] + + return config + +def strip_margin(text: str) -> str: + return re.sub("(\n?)[ \t]*\|", "\\1", text) + +def process_params(par: dict[str, Any], config) -> str: + # check input parameters + assert par["genome_fasta"], "Pass at least one set of inputs to --genome_fasta." + assert par["gtf"], "Pass at least one set of inputs to --gtf." + assert par["reference_archive"].endswith(".tar.gz"), "Output reference_archive must end with .tar.gz." + + # make paths absolute + for argument in config["all_arguments"]: + if par[argument["clean_name"]] and argument["type"] == "file": + if isinstance(par[argument["clean_name"]], list): + par[argument["clean_name"]] = [ os.path.abspath(f) for f in par[argument["clean_name"]] ] + else: + par[argument["clean_name"]] = os.path.abspath(par[argument["clean_name"]]) + + return par + +def generate_config(par: dict[str, Any], meta, config) -> str: + content_list = [strip_margin(f"""\ + |#!/usr/bin/env cwl-runner + | + |""")] + + + config_key_value_pairs = [] + for argument in config["all_arguments"]: + config_key = (argument.get("info") or {}).get("config_key") + arg_type = argument["type"] + par_value = par[argument["clean_name"]] + if par_value and config_key: + config_key_value_pairs.append((config_key, arg_type, par_value)) + + if meta["cpus"]: + config_key_value_pairs.append(("Maximum_threads", "integer", meta["cpus"])) + + # print(config_key_value_pairs) + + for config_key, arg_type, par_value in config_key_value_pairs: + if arg_type == "file": + str = strip_margin(f"""\ + |{config_key}: + |""") + if isinstance(par_value, list): + for file in par_value: + str += strip_margin(f"""\ + | - class: File + | location: "{file}" + |""") + else: + str += strip_margin(f"""\ + | class: File + | location: "{par_value}" + |""") + content_list.append(str) + else: + content_list.append(strip_margin(f"""\ + |{config_key}: {par_value} + |""")) + + ## Write config to file + return "".join(content_list) + +def get_cwl_file(meta: dict[str, Any]) -> str: + # create cwl file (if need be) + cwl_file=os.path.join(meta["resources_dir"], "make_rhap_reference_2.2.1_nodocker.cwl") + + return cwl_file + +def main(par: dict[str, Any], meta: dict[str, Any]): + config = read_config(meta["config"]) + + # Preprocess params + par = process_params(par, config) + + # fetch cwl file + cwl_file = get_cwl_file(meta) + + # Create output dir if not exists + outdir = os.path.dirname(par["reference_archive"]) + if not os.path.exists(outdir): + os.makedirs(outdir) + + ## Run pipeline + with tempfile.TemporaryDirectory(prefix="cwl-bd_rhapsody_wta-", dir=meta["temp_dir"]) as temp_dir: + # Create params file + config_file = os.path.join(temp_dir, "config.yml") + config_content = generate_config(par, meta, config) + with open(config_file, "w") as f: + f.write(config_content) + + + cmd = [ + "cwl-runner", + "--no-container", + "--preserve-entire-environment", + "--outdir", + temp_dir, + cwl_file, + config_file + ] + + env = dict(os.environ) + env["TMPDIR"] = temp_dir + + print("> " + " ".join(cmd), flush=True) + _ = subprocess.check_call( + cmd, + cwd=os.path.dirname(config_file), + env=env + ) + + shutil.move(os.path.join(temp_dir, "Rhap_reference.tar.gz"), par["reference_archive"]) + +if __name__ == "__main__": + main(par, meta) diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh b/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh new file mode 100644 index 00000000..3637160a --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +set -e + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_doesnt_exist() { + [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } +} +assert_file_empty() { + # () will execute in a shubshell, could you use {;}? + [ ! -s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; } +} +assert_file_not_empty() { + # [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + # grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_file_not_contains() { + # grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) + grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } +} + +in_fa="$meta_resources_dir/test_data/reference_small.fa" +in_gtf="$meta_resources_dir/test_data/reference_small.gtf" + +echo "#############################################" +echo "> Simple run" + +mkdir simple_run +cd simple_run + +out_tar="myreference.tar.gz" + +echo "> Running $meta_name." +$meta_executable \ + --genome_fasta "$in_fa" \ + --gtf "$in_gtf" \ + --reference_archive "$out_tar" \ + --extra_star_params "--genomeSAindexNbases 6" \ + ---cpus 2 + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +assert_file_exists "$out_tar" +assert_file_not_empty "$out_tar" + +echo ">> Checking whether output contains the expected files" +tar -xvf "$out_tar" > /dev/null +assert_file_exists "BD_Rhapsody_Reference_Files/star_index/genomeParameters.txt" +assert_file_exists "BD_Rhapsody_Reference_Files/bwa-mem2_index/reference_small.ann" +assert_file_exists "BD_Rhapsody_Reference_Files/reference_small-processed.gtf" +assert_file_exists "BD_Rhapsody_Reference_Files/mitochondrial_contigs.txt" +assert_file_contains "BD_Rhapsody_Reference_Files/reference_small-processed.gtf" "chr1.*HAVANA.*ENSG00000243485" +assert_file_contains "BD_Rhapsody_Reference_Files/mitochondrial_contigs.txt" 'chrMT' + +cd .. + +echo "#############################################" + +echo "> Tests succeeded!" \ No newline at end of file diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa new file mode 100644 index 00000000..386d887c --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa @@ -0,0 +1,27 @@ +>chr1 1 +TGGGGAAGCAAGGCGGAGTTGGGCAGCTCGTGTTCAATGGGTAGAGTTTCAGGCTGGGGT +GATGGAAGGGTGCTGGAAATGAGTGGTAGTGATGGCGGCACAACAGTGTGAATCTACTTA +ATCCCACTGAACTGTATGCTGAAAAATGGTTTAGACGGTGAATTTTAGGTTATGTATGTT +TTACCACAATTTTTAAAAAGCTAGTGAAAAGCTGGTAAAAAGAAAGAAAAGAGGCTTTTT +TAAAAAGTTAAATATATAAAAAGAGCATCATCAGTCCAAAGTCCAGCAGTTGTCCCTCCT +GGAATCCGTTGGCTTGCCTCCGGCATTTTTGGCCCTTGCCTTTTAGGGTTGCCAGATTAA +AAGACAGGATGCCCAGCTAGTTTGAATTTTAGATAAACAACGAATAATTTCGTAGCATAA +ATATGTCCCAAGCTTAGTTTGGGACATACTTATGCTAAAAAACATTATTGGTTGTTTATC +TGAGATTCAGAATTAAGCATTTTATATTTTATTTGCTGCCTCTGGCCACCCTACTCTCTT +CCTAACACTCTCTCCCTCTCCCAGTTTTGTCCGCCTTCCCTGCCTCCTCTTCTGGGGGAG +TTAGATCGAGTTGTAACAAGAACATGCCACTGTCTCGCTGGCTGCAGCGTGTGGTCCCCT +TACCAGAGGTAAAGAAGAGATGGATCTCCACTCATGTTGTAGACAGAATGTTTATGTCCT +CTCCAAATGCTTATGTTGAAACCCTAACCCCTAATGTGATGGTATGTGGAGATGGGCCTT +TGGTAGGTAATTACGGTTAGATGAGGTCATGGGGTGGGGCCCTCATTATAGATCTGGTAA +GAAAAGAGAGCATTGTCTCTGTGTCTCCCTCTCTCTCTCTCTCTCTCTCTCTCATTTCTC +TCTATCTCATTTCTCTCTCTCTCGCTATCTCATTTTTCTCTCTCTCTCTTTCTCTCCTCT +GTCTTTTCCCACCAAGTGAGGATGCGAAGAGAAGGTGGCTGTCTGCAAACCAGGAAGAGA +GCCCTCACCGGGAACCCGTCCAGCTGCCACCTTGAACTTGGACTTCCAAGCCTCCAGAAC +TGTGAGGGATAAATGTATGATTTTAAAGTCGCCCAGTGTGTGGTATTTTGTTTTGACTAA +TACAACCTGAAAACATTTTCCCCTCACTCCACCTGAGCAATATCTGAGTGGCTTAAGGTA +CTCAGGACACAACAAAGGAGAAATGTCCCATGCACAAGGTGCACCCATGCCTGGGTAAAG +CAGCCTGGCACAGAGGGAAGCACACAGGCTCAGGGATCTGCTATTCATTCTTTGTGTGAC +CCTGGGCAAGCCATGAATGGAGCTTCAGTCACCCCATTTGTAATGGGATTTAATTGTGCT +TGCCCTGCCTCCTTTTGAGGGCTGTAGAGAAAAGATGTCAAAGTATTTTGTAATCTGGCT +GGGCGTGGTGGCTCATGCCTGTAATCCTAGCACTTTGGTAGGCTGACGCGAGAGGACTGC +T diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf new file mode 100644 index 00000000..7ba83523 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf @@ -0,0 +1,8 @@ +chr1 HAVANA exon 565 668 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000473358.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-202"; exon_number 2; exon_id "ENSE00001922571.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "dotter_confirmed"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; +chr1 HAVANA exon 977 1098 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000473358.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-202"; exon_number 3; exon_id "ENSE00001827679.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "dotter_confirmed"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; +chr1 HAVANA transcript 268 1110 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; +chr1 HAVANA exon 268 668 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; exon_number 1; exon_id "ENSE00001841699.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; +chr1 HAVANA exon 977 1110 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; exon_number 2; exon_id "ENSE00001890064.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; +chr1 ENSEMBL gene 367 504 . + . gene_id "ENSG00000284332.1"; gene_type "miRNA"; gene_name "MIR1302-2"; level 3; hgnc_id "HGNC:35294"; +chr1 ENSEMBL transcript 367 504 . + . gene_id "ENSG00000284332.1"; transcript_id "ENST00000607096.1"; gene_type "miRNA"; gene_name "MIR1302-2"; transcript_type "miRNA"; transcript_name "MIR1302-2-201"; level 3; transcript_support_level "NA"; hgnc_id "HGNC:35294"; tag "basic"; tag "Ensembl_canonical"; +chr1 ENSEMBL exon 367 504 . + . gene_id "ENSG00000284332.1"; transcript_id "ENST00000607096.1"; gene_type "miRNA"; gene_name "MIR1302-2"; transcript_type "miRNA"; transcript_name "MIR1302-2-201"; exon_number 1; exon_id "ENSE00003695741.1"; level 3; transcript_support_level "NA"; hgnc_id "HGNC:35294"; tag "basic"; tag "Ensembl_canonical"; diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh new file mode 100644 index 00000000..8d468064 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +TMP_DIR=/tmp/bd_rhapsody_make_reference +OUT_DIR=src/bd_rhapsody/bd_rhapsody_make_reference/test_data + +# check if seqkit is installed +if ! command -v seqkit &> /dev/null; then + echo "seqkit could not be found" + exit 1 +fi + +# create temporary directory and clean up on exit +mkdir -p $TMP_DIR +function clean_up { + rm -rf "$TMP_DIR" +} +trap clean_up EXIT + +# fetch reference +ORIG_FA=$TMP_DIR/reference.fa.gz +if [ ! -f $ORIG_FA ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz \ + -O $ORIG_FA +fi + +ORIG_GTF=$TMP_DIR/reference.gtf.gz +if [ ! -f $ORIG_GTF ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz \ + -O $ORIG_GTF +fi + +# create small reference +START=30000 +END=31500 +CHR=chr1 + +# subset to small region +seqkit grep -r -p "^$CHR\$" "$ORIG_FA" | \ + seqkit subseq -r "$START:$END" > $OUT_DIR/reference_small.fa + +zcat "$ORIG_GTF" | \ + awk -v FS='\t' -v OFS='\t' " + \$1 == \"$CHR\" && \$4 >= $START && \$5 <= $END { + \$4 = \$4 - $START + 1; + \$5 = \$5 - $START + 1; + print; + }" > $OUT_DIR/reference_small.gtf