forked from tjblaette/ngs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
deseq2_wrapper.sh
executable file
·110 lines (97 loc) · 4.38 KB
/
deseq2_wrapper.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/bin/bash
####
# T.J.Blätte
# 2015
####
#
# Wrapper to initiate differential gene expression (DGE)
# analysis with DESeq2.
#
# Args:
# IN: Design table. A TSV file specifying at least the
# columns `fileName`, `sampleName` and the condition
# to test on for differential expression. Each line
# describes one sample. Filenames must refer to the
# respective read count files, which are expected to
# reside within the `intermediate_files` subfolder,
# within the directory from which the script is run. These
# count files are those output by HTSeq or STAR,
# following the respective formatting in the latter case.
# All of the information provided in this table will be
# used to annotate the respective output plots.
# DESIGN: Design formula to test for, something like
# `~ condition` for an unpaired analysis or
# `~ patient + condition` for a paired analysis.
# Provided factors must be defined as columns in $IN.
# REFERENCE_LEVEL: The reference level defines relative to what
# fold changes are calculated. Typically, this is something
# like `untreated`, `control` or similar. The term must
# be specified in the condition column of the design table
# that is tested.
# ALPHA: False discovery rate (FDR) cutoff that defines differentially
# expressed genes. Defaults to `0.1`.
# LFC: The minimum log fold change to test for between groups.
# Defaults to `0.6` (1.5 on a linear / non-log scale). Set it
# to `0` (1 on a linear / non-log scale) to test for **any**
# difference between groups.
# BIOMART: Database file for ensembl gene ID to gene symbol conversion.
# Defaults to '/NGS/known_sites/hg19/gencode.v19.chr_patch_hapl_scaff.annotation_UCSCcontigs.gtf'
#
# * Optional input files: *
# replace_sizeFactors.txt: File containing size factors calculated
# and used by DESeq2 to normalize for different samples' sequencing
# depth. If a file with this exact name is present, it will be used
# instead of the size factors calculated by DESeq2 during the analysis
# of the data in $IN. These files are saved during each analysis,
# though with a sample-dependent filename. Rename or symlink them to
# replace size factors in a subsequent analysis.
# candidates.txt: File containing genes or other features for which a
# separate set of output files is to be written. IDs must be provided
# one per line and must match those in the count files of $IN.
#
# Output:
# Several output files are written, all prefixed with a combination
# of $IN and $DESIGN:
# *counts*txt: Tables containing the normalized and
# log-transformed read counts of all samples.
# *woutNA.txt: Table containing the test statistics, fold changes,
# p- and FDR estimates of all genes.
# *pdf: PDF plots of principal components 1-6 from PCA and heatmaps
# of different hierarchical clusterings, for differentially expressed
# genes and increasing subsets of genes with a high coefficient of
# variation (CV).
#
####
set -e
set -u
IN="$1"
IN_PREFIX="$(basename "${IN%.tsv}")"
DESIGN="$2"
OUT_PREFIX="${IN_PREFIX}_DESeq2$(echo $DESIGN | sed 's/ //g')"
REFERENCE_LEVEL="$3"
ALPHA=${4:-0.1}
LFC=${5:-0.6}
#BIOMART=${6:-'/NGS/known_sites/human_ensembl_biomart_gene_ID_to_symbol/mart_export_sorted_woutLRG.txt'}
BIOMART=${6:-'/NGS/known_sites/hg19/gencode.v19.chr_patch_hapl_scaff.annotation_UCSCcontigs.gtf'}
GENE_ID_TO_SYMBOL_DICT="${BIOMART%.gtf}_geneIDtoSymbol.txt"
# check if the gene ID to Symbol conversion table exists already
# if it does not, create it now
if [ ! -e "$GENE_ID_TO_SYMBOL_DICT" ]
then
cat "$BIOMART" | cut -f9 | cut -f1,5 -d';' | sed -e 's/[a-z]\+_[a-z]\+//g' -e 's/;/\t/' -e 's/"//g' -e 's/ //g' | grep -v 'level' | sort -k1,1b | uniq > "$GENE_ID_TO_SYMBOL_DICT"
fi
# RUN DESEQ2 SCRIPT
# -> use the script in the same folder as this script
Rscript "$(dirname $0)/deseq2.R" "$IN" "$DESIGN" "$REFERENCE_LEVEL" "$ALPHA" "$LFC" "$OUT_PREFIX" "$GENE_ID_TO_SYMBOL_DICT"
echo ""
# FIX column header of gene-symbol annotated text files
for FILE in "${OUT_PREFIX}_"*txt
do
sed -i '1s/^\t/geneID_geneSymbol\t/' $FILE
done
# save separate DESeq2 output file with defined p-values only
for FILE in "${OUT_PREFIX}_all.txt"
do
[ -f "$FILE" ] || continue
grep -wv 'NA' "$FILE" > "${FILE%.txt}_woutNA.txt"
done