From 01db1c205430b0bf62c68d3b67c4551900a73607 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Sun, 25 Aug 2024 21:25:15 +0100
Subject: [PATCH] initial commit, files from rnaseq.vsh

---
 src/rsem/rsem_merge_counts/config.vsh.yaml | 64 ++++++++++++++++++++++
 src/rsem/rsem_merge_counts/script.sh       | 53 ++++++++++++++++++
 2 files changed, 117 insertions(+)
 create mode 100644 src/rsem/rsem_merge_counts/config.vsh.yaml
 create mode 100644 src/rsem/rsem_merge_counts/script.sh

diff --git a/src/rsem/rsem_merge_counts/config.vsh.yaml b/src/rsem/rsem_merge_counts/config.vsh.yaml
new file mode 100644
index 00000000..cb8ec9f2
--- /dev/null
+++ b/src/rsem/rsem_merge_counts/config.vsh.yaml
@@ -0,0 +1,64 @@
+name: "rsem_merge_counts"
+namespace: "rsem"
+description: Merge the transcript quantification results obtained from rsem calculate-expression across all samples.
+keywords: [rsem, transcript, expression, counts]
+links:
+  homepage: https://deweylab.github.io/RSEM/
+  documentation: https://deweylab.github.io/RSEM/rsem-calculate-expression.html
+  repository: https://github.com/deweylab/RSEM
+references:
+  doi: 10.1186/1471-2105-12-323
+license: GPL-3.0
+
+argument_groups:
+  - name: "Input"
+    arguments:
+      # One RSEM result file per sample; several files may be passed at once.
+      - name: "--counts_gene"
+        type: file
+        description: Expression counts on gene level (genes)
+        multiple: true
+      - name: "--counts_transcripts"
+        type: file
+        description: Expression counts on transcript level (isoforms)
+        multiple: true
+
+  - name: "Output"
+    arguments:
+      - name: "--merged_gene_counts"
+        type: file
+        description: File containing gene counts across all samples.
+        example: rsem.merged.gene_counts.tsv
+        direction: output
+      - name: "--merged_gene_tpm"
+        type: file
+        description: File containing gene TPM across all samples.
+        example: rsem.merged.gene_tpm.tsv
+        direction: output
+      - name: "--merged_transcript_counts"
+        type: file
+        description: File containing transcript counts across all samples.
+        example: rsem.merged.transcript_counts.tsv
+        direction: output
+      - name: "--merged_transcript_tpm"
+        type: file
+        description: File containing transcript TPM across all samples.
+        example: rsem.merged.transcript_tpm.tsv
+        direction: output
+
+resources:
+  - type: bash_script
+    path: script.sh
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - path: test_data
+
+engines:
+  - type: docker
+    image: ubuntu:22.04
+
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/rsem/rsem_merge_counts/script.sh b/src/rsem/rsem_merge_counts/script.sh
new file mode 100644
index 00000000..524d44c0
--- /dev/null
+++ b/src/rsem/rsem_merge_counts/script.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Merge per-sample RSEM results into one table per level (genes, isoforms)
+# and per measure: column 5 feeds the counts tables, column 6 the TPM tables.
+#
+# Inputs are read from the Viash-provided variables par_counts_gene and
+# par_counts_transcripts, each a ';'-separated list of result files.
+
+set -eo pipefail
+
+# Private scratch directory, so files left behind by an earlier run can never
+# be picked up by the globs in merge_level.
+work_dir=$(mktemp -d "${meta_temp_dir:-/tmp}/rsem_merge_counts-XXXXXX")
+trap 'rm -rf "$work_dir"' EXIT
+
+# merge_level LEVEL SUFFIX COUNTS_OUT TPM_OUT [FILE...]
+# Writes the two identifier columns of the first FILE followed by one column
+# per FILE. Each sample column is headed by the FILE name minus SUFFIX.
+merge_level() {
+  local level_dir="$work_dir/$1" suffix="$2" counts_out="$3" tpm_out="$4"
+  shift 4
+
+  # No input for this level: leave its outputs untouched.
+  if [ "$#" -eq 0 ]; then
+    return 0
+  fi
+
+  mkdir -p "$level_dir"
+
+  # Identifier columns come from the first file only; every file is assumed
+  # to list the same features in the same order.
+  cut -f 1,2 "$1" > "$level_dir/ids.txt"
+
+  local result_file sample
+  for result_file in "$@"; do
+    sample=$(basename "$result_file" "$suffix")
+    { echo "$sample"; cut -f 5 "$result_file" | tail -n +2; } > "$level_dir/$sample.counts.txt"
+    { echo "$sample"; cut -f 6 "$result_file" | tail -n +2; } > "$level_dir/$sample.tpm.txt"
+  done
+
+  # The glob sorts sample columns by sample name.
+  paste "$level_dir/ids.txt" "$level_dir"/*.counts.txt > "$counts_out"
+  paste "$level_dir/ids.txt" "$level_dir"/*.tpm.txt > "$tpm_out"
+}
+
+# Viash passes multi-value arguments as a single ';'-separated string.
+IFS=";" read -ra gene_files <<< "$par_counts_gene"
+IFS=";" read -ra transcript_files <<< "$par_counts_transcripts"
+
+merge_level genes .genes.results \
+  "$par_merged_gene_counts" "$par_merged_gene_tpm" "${gene_files[@]}"
+merge_level isoforms .isoforms.results \
+  "$par_merged_transcript_counts" "$par_merged_transcript_tpm" "${transcript_files[@]}"