From 01db1c205430b0bf62c68d3b67c4551900a73607 Mon Sep 17 00:00:00 2001
From: emmarousseau <emmarou1@icloud.com>
Date: Sun, 25 Aug 2024 21:25:15 +0100
Subject: [PATCH] initial commit, files from rnaseq.vsh

---
 src/rsem/rsem_merge_counts/config.vsh.yaml | 61 ++++++++++++++++++++++
 src/rsem/rsem_merge_counts/script.sh       | 28 ++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 src/rsem/rsem_merge_counts/config.vsh.yaml
 create mode 100644 src/rsem/rsem_merge_counts/script.sh

diff --git a/src/rsem/rsem_merge_counts/config.vsh.yaml b/src/rsem/rsem_merge_counts/config.vsh.yaml
new file mode 100644
index 00000000..cb8ec9f2
--- /dev/null
+++ b/src/rsem/rsem_merge_counts/config.vsh.yaml
@@ -0,0 +1,67 @@
+name: "rsem_merge_counts"
+namespace: "rsem"
+description: Merge the transcript quantification results obtained from rsem calculate-expression across all samples.
+keywords: [rsem, transcript, expression, counts]
+links:
+  homepage: https://deweylab.github.io/RSEM/
+  documentation: https://deweylab.github.io/RSEM/rsem-calculate-expression.html
+  repository: https://github.com/deweylab/RSEM
+references:
+  doi: 10.1186/1471-2105-12-323
+license: GPL-3.0
+
+argument_groups:
+  # Each input takes one RSEM results file per sample; with `multiple: true`
+  # the values reach the script as a single ";"-separated string.
+  - name: "Input"
+    arguments:
+      - name: "--counts_gene"
+        type: file
+        description: Expression counts on gene level (genes)
+        multiple: true
+      - name: "--counts_transcripts"
+        type: file
+        description: Expression counts on transcript level (isoforms)
+        multiple: true
+
+  - name: "Output"
+    arguments:
+      - name: "--merged_gene_counts"
+        type: file
+        description: File containing gene counts across all samples.
+        example: rsem.merged.gene_counts.tsv
+        direction: output
+      - name: "--merged_gene_tpm"
+        type: file
+        description: File containing gene TPM across all samples.
+        example: rsem.merged.gene_tpm.tsv
+        direction: output
+      - name: "--merged_transcript_counts"
+        type: file
+        description: File containing transcript counts across all samples.
+        example: rsem.merged.transcript_counts.tsv
+        direction: output
+      - name: "--merged_transcript_tpm"
+        type: file
+        description: File containing transcript TPM across all samples.
+        example: rsem.merged.transcript_tpm.tsv
+        direction: output
+
+resources:
+  - type: bash_script
+    path: script.sh
+
+# NOTE(review): test.sh and test_data are not among the files this patch
+# creates; confirm they are added separately.
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - path: test_data
+
+engines:
+  - type: docker
+    image: ubuntu:22.04
+
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/rsem/rsem_merge_counts/script.sh b/src/rsem/rsem_merge_counts/script.sh
new file mode 100644
index 00000000..524d44c0
--- /dev/null
+++ b/src/rsem/rsem_merge_counts/script.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+# Merge per-sample RSEM result tables into one table per level and measure.
+#
+# Inputs (variables derived from the component's arguments):
+#   par_counts_gene         *.genes.results files
+#   par_counts_transcripts  *.isoforms.results files
+# Each value may be a single path or a ';'-separated list of paths; a
+# directory is expanded to the regular files directly inside it.
+# Outputs: par_merged_gene_counts, par_merged_gene_tpm,
+#          par_merged_transcript_counts, par_merged_transcript_tpm
+# Columns are joined positionally with `paste`, so every file of a level is
+# assumed to list the same rows in the same order -- TODO confirm.
+
+set -eo pipefail
+
+# Private scratch directory, removed when the script exits.
+tmp_dir="$(mktemp -d "${meta_temp_dir:-${TMPDIR:-/tmp}}/rsem_merge_counts.XXXXXX")"
+trap 'rm -rf "$tmp_dir"' EXIT
+
+# Print one input file per line for a raw parameter value ($1).
+expand_inputs() {
+  local raw="$1" entry file
+  local -a entries=()
+  IFS=';' read -r -a entries <<< "$raw"
+  for entry in "${entries[@]}"; do
+    if [ -z "$entry" ]; then
+      continue
+    elif [ -d "$entry" ]; then
+      for file in "$entry"/*; do
+        if [ -f "$file" ]; then printf '%s\n' "$file"; fi
+      done
+    else
+      printf '%s\n' "$entry"
+    fi
+  done
+}
+
+# Merge counts (column 5) and TPM (column 6) of all samples for one level.
+#   $1 level name, used as scratch subdirectory
+#   $2 file-name suffix stripped to obtain the sample name
+#   $3 raw input parameter value
+#   $4 counts output path, $5 TPM output path (an empty path is skipped)
+merge_level() {
+  local level="$1" suffix="$2" raw_inputs="$3" out_counts="$4" out_tpm="$5"
+  local workdir="$tmp_dir/$level"
+  local -a files=() count_cols=() tpm_cols=()
+  local file sample
+
+  if [ -z "$raw_inputs" ]; then
+    echo "No $level results provided, skipping." >&2
+    return 0
+  fi
+
+  mapfile -t files < <(expand_inputs "$raw_inputs")
+  if [ "${#files[@]}" -eq 0 ]; then
+    echo "Error: no $level result files found in '$raw_inputs'." >&2
+    return 1
+  fi
+
+  mkdir -p "$workdir"
+  # Identifier columns (with header) come from the first sample.
+  cut -f 1,2 "${files[0]}" > "$workdir/ids.txt"
+
+  for file in "${files[@]}"; do
+    sample="$(basename -- "$file")"
+    sample="${sample%"$suffix"}"
+    # The sample name replaces the original column header.
+    { printf '%s\n' "$sample"; cut -f 5 "$file" | tail -n +2; } > "$workdir/$sample.counts.txt"
+    { printf '%s\n' "$sample"; cut -f 6 "$file" | tail -n +2; } > "$workdir/$sample.tpm.txt"
+    count_cols+=("$workdir/$sample.counts.txt")
+    tpm_cols+=("$workdir/$sample.tpm.txt")
+  done
+
+  if [ -n "$out_counts" ]; then
+    paste "$workdir/ids.txt" "${count_cols[@]}" > "$out_counts"
+  fi
+  if [ -n "$out_tpm" ]; then
+    paste "$workdir/ids.txt" "${tpm_cols[@]}" > "$out_tpm"
+  fi
+}
+
+merge_level genes ".genes.results" "$par_counts_gene" \
+  "$par_merged_gene_counts" "$par_merged_gene_tpm"
+merge_level isoforms ".isoforms.results" "$par_counts_transcripts" \
+  "$par_merged_transcript_counts" "$par_merged_transcript_tpm"