-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathanalysis_example.r
executable file
·238 lines (197 loc) · 7.15 KB
/
analysis_example.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
library(R.utils)
library(optparse)
library(rtracklayer)
library(plyr)
library(dplyr)
library(reshape)
library(DESeq2)
library(ggplot2)
library(ggrepel)
library(DOSE)
library(clusterProfiler)
library(biomaRt)
library(openxlsx)
library(VennDiagram)
library(grid)
library(gridExtra)
library(grDevices)
library(RColorBrewer)
library(mixOmics)
library(vsn)
###############################################################################
## ##
## FUNCTIONS ##
## ##
###############################################################################
###
get_counts <-
# take a file path and return a list of two data frames : the whole
# result file ($df) and just the count ($count)
function(filepath,
count_colname="expected_count",
geneid_colname="gene_id") {
# extract the sample ID from the file name
sample_id <- sub("\\..*$", "", basename(filepath))
# load the result file and add the sample ID
df <- read.table(filepath, header=T)
df$sample_id <- sample_id
# the count data frame is composed of just one column storing the counts.
# This column is named with the sample ID and the row names are
# corresponding to the gene IDs.
count <- data.frame(df[,count_colname])
rownames(count) <- df[,geneid_colname]
names(count) <- sample_id
return(list(df=df, count=count))
}
###############################################################################
## ##
## ARGUMENTS PARSING ##
## ##
###############################################################################
###################
## THE ARGUMENTS ##
###################
## the directory that contains the STAR results files
results.opt <- make_option(
c("-r", "--results-dir"),
type="character",
default=NULL,
help="results files directory absolute path"
)
## the design file that maps sample names to conditions
design.opt <- make_option(
c("-d", "--design-file"),
type="character",
default=NULL,
help="design file absolute path"
)
## the binomial which is required for gene ontology analysis
binom.opt <- make_option(
c("-b", "--binomial"),
type="character",
default=NULL,
help="binomial"
)
## the design file that maps sample names to conditions
annot.opt <- make_option(
c("-a", "--annot-file"),
type="character",
default=NULL,
help="annotation file absolute path"
)
## the q value threshold for the differentially expressed genes
thresh.opt <- make_option(
c("-t", "--q-value-threshold"),
type="double",
default=0.05,
help="q value threshold for significant genes"
)
## the output directory
output.opt <- make_option(
c("-o", "--output-dir"),
type="character",
default=".",
help="output directory absolute path"
)
#############
## PARSING ##
#############
## create the options and parse them
options = list(results.opt,
design.opt,
binom.opt,
annot.opt,
thresh.opt,
output.opt)
parser = OptionParser(option_list=options)
args = parse_args(parser)
## count the minimal number of argument
default.values <- lapply(options, function(x)as.character(attr(x, "default")))
min.arg.count <- length(which(unlist(default.values)=="\001NULL\001"))
## it has to be at least min.arg.count arguments
if (length(args)-1<min.arg.count) { # "args" contains also $help, so remove 1
print_help(parser)
stop()
}
## after checking, everything shoud be ok
RESULTS_DIRPATH <- args[["results-dir"]]
DESIGN_FILEPATH <- args[["design-file"]]
BINOMIAL <- args[["binomial"]]
ANNOT_FILEPATH <- args[["annot-file"]]
Q_VALUE_THRESHOLD <- args[["q-value-threshold"]]
OUTPUT_DIRPATH <- args[["output-dir"]]
###############################################################################
## ##
## DESeq2 ##
## ##
###############################################################################
## very annoying
dummy.path <- file.path(OUTPUT_DIRPATH, "dummy.pdf")
pdf(dummy.path)
# get the sorted file names
results.files <- list.files(
path=RESULTS_DIRPATH,
recursive=T,
full.name=T,
pattern=".*genes.results$"
)
results.files <- sort(results.files)
# import the results as either complete data frames or count data frames
results <- lapply(results.files, get_counts)
names(results) <- unlist(lapply(results, function(x)x[["df"]][1,"sample_id"]))
# from the results list, create a complete long data frame
df <- do.call(rbind, lapply(results, function(x)return(x$df)))
# from the results list, create a wide data frame with the just the counts
count <- do.call(cbind, lapply(results, function(x)return(x$count)))
# remove 0s
count <- count[rowMeans(count)>0,]
# design table
design <- read.csv(DESIGN_FILEPATH, header=T)
design <- as.data.frame(apply(design, 2, as.character))
design <- design[, c("sample", "treatment")]
condition <- design[ match(names(count), design[,"sample"]) , "treatment" ]
condition <- as.factor(condition)
# the DESeq2 matrix
dds <-
DESeq2::DESeqDataSetFromMatrix(countData=round(count),
colData=DataFrame(condition=condition),
design=as.formula("~ condition"))
# estimating size factors and dispersion
dds <- DESeq2::DESeq(dds)
# the differentially expressed genes
results <- DESeq2::results(dds,
alpha=.05,
contrast=c("condition", "treated", "untreated"))
write.csv(results, file.path(OUTPUT_DIRPATH, "diff_genes.csv"))
# the pca plot
pca <- plotPCA(rlog(dds), intgroup="condition", ntop=1000)
pdf(file.path(OUTPUT_DIRPATH, "pca_plot.pdf"))
print(pca)
dev.off()
png(file.path(OUTPUT_DIRPATH, "pca_plot.png"), type="cairo")
print(pca)
dev.off()
# gene ontology for biological process
ontology <- clusterProfiler::enrichGO(rownames(results),
OrgDb="org.Hs.eg.db",
keytype="ENSEMBL",
ont="BP",
pAdjustMetho="BH",
pvalueCutoff=.01,
qvalueCutoff=.05,
readable=T)
# dot plot for gene ontology
dplot <- DOSE::dotplot(ontology)
dplot$data$Description <- substr(dplot$data$Description, 0, 50)
dplot$data$Description <- make.unique(dplot$data$Description, sep = "_")
idx <- order(dplot$data$GeneRatio, decreasing=F)
dplot$data$Description <-
factor(dplot$data$Description, levels=dplot$data$Description[idx])
ggsave(file.path(OUTPUT_DIRPATH, "ontology_biological_process.pdf"), dplot)
ggsave(file.path(OUTPUT_DIRPATH, "ontology_biological_process.png"), dplot,
type="cairo")
## very annoying
dev.off()
file.remove(dummy.path)