-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsva_analysis.R
232 lines (182 loc) · 8.66 KB
/
sva_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
setwd("C:/Users/Naugh/Dropbox (GaTech)/Gibson/Working/SickleCell/metadata/RNA/")
library(dplyr)
library(sva)
library(snm)
library(pvca)
library(Biobase)
library(ExpressionNormalizationWorkflow)
########### Prep metadata ####################################
pheno <- readxl::read_xlsx("../pain-omics-phenotype/Pain omics Phenotype_230122.xlsx",
sheet = "TORIDs Passing QC with measurem")
pheno <- as.data.frame(pheno)
# Subset down to samples with NWGC IDs associated with CD71 TORIDs
pheno <- pheno[!is.na(pheno$CD71_NWGC_ID),]
#Select columns for downstream analysis
feature_col <- c("CD71_NWGC_ID", "Subject_ID", "CD71_libprep_batch", "Sex", "Timepoint..at.RNA.collection.", "Chronic.Pain.")
pheno <- pheno %>% select(any_of(feature_col))
#Drop rows with NA
# IS THIS NECESSARY?
pheno <- na.omit(pheno)
# Make metadata column grouping points\
base <- c("Baseline", "HU", "HU MTD", "Metformin Baseline", "Metformin", "Metformin MTD")
pheno$steadyState.vs.voc <- ifelse(pheno$Timepoint..at.RNA.collection. %in% base, "Steady State", pheno$Timepoint..at.RNA.collection.)
pheno <- pheno[which(pheno$steadyState.vs.voc!="Inpatient F/U"),] # Get id of VOC F/U
#Subset to unique individuals with baseline & VOC
# There are still duplicates accross the two categories
pheno <- pheno %>%
select(CD71_NWGC_ID, Subject_ID, steadyState.vs.voc, Timepoint..at.RNA.collection., CD71_libprep_batch, Sex, Chronic.Pain.) %>%
group_by(steadyState.vs.voc) %>%
distinct(Subject_ID, .keep_all = T) %>%
ungroup() %>%
as.data.frame()
row.names(pheno) <- pheno$CD71_NWGC_ID
pheno$CD71_NWGC_ID <- NULL
#Change metadata to numeric for numeric features
#pheno$HGB <- as.numeric(sub(" .*", "", pheno$HGB))
# Select 10 random rows for each "Baseline" and "Inpatient VOC" group
#pheno <- pheno %>%
# select(CD71_NWGC_ID, Subject_ID, baseline.vs.voc, CD71_libprep_batch, Sex) %>%
# group_by(baseline.vs.voc) %>%
# distinct(Subject_ID, .keep_all = T) %>%
# sample_n(10) %>%
# ungroup() %>%
# as.data.frame()
# Change metadata character columns to factors
# This may need to be edited once more metadata features are used
pheno[] <- lapply(pheno, function(x) {
if (is.character(x)) {
as.factor(x)
} else {
x
}
})
pheno$Subject_ID <- as.factor(pheno$Subject_ID)
########### Prep gene count matrix ##########################
#gct <- read.table("cd71_tpm_counts.gct", header = T, check.names = F)
gct <- read.table(gzfile("../../pharmhu/aggregate_files/pharmhu_topmed_to5_rnaseq_gene_tpm.gct.gz"),
header = T, check.names = F, skip = 2)
#gct2 <- gct
#gct <- gct2
# Remove rows where gene name is a duplicate (why are they present?) and remove gene name column
gct <- gct[!duplicated(gct$Description),]
rownames(gct) <- gct$Description
gct <- gct[,-c(1,2)]
#gct <- gct[,-1] #Keeping for testing purposes so no need to unzip count matrix
# Remove ERCC transcripts
gct <- gct[!startsWith(row.names(gct), "ERCC-"), ]
# Remove sample not found in metadata and order the count matrix with respect to the metadata
cols_to_keep <- colnames(gct) %in% rownames(pheno)
gct <- gct[, cols_to_keep]
gct <- gct[, order(match(colnames(gct), rownames(pheno)))]
# Remove genes that are not expressed in at least 10% of samples
threshold <- 0.05
gct <- gct[which(rowSums(gct > 0)/ncol(gct) >= threshold), ]
# Remove genes where counts are zero for all samples
#gct <- gct[which(rowSums(gct) != 0),]
# Remove genes where count average is <= 1 TPM
gct <- gct[which(rowSums(gct)/ncol(gct) > .1), ]
#gct <- gct[which(rowSums(gct)/ncol(gct) > 1), ]
# Log2(1+X) transform the count matrix
gct <- apply(gct, 2, function(x) log2(x+1))
##################### PVCA with original covariates #################
covariates <- c("CD71_libprep_batch", "Sex", "steadyState.vs.voc", "Chronic.Pain.")
pct_threshold <- 0.75
pd <- new("AnnotatedDataFrame", data = pheno)
inpData <- ExpressionSet(assayData = gct, phenoData = pd)
# Make PVCA plot
pvcaPlot1 <- pvcAnaly(inpData, pct_threshold, covariates) #Need an "Expression Set" Object as input
##################### SVA Analysis ########################################
### Make formulas for modeling matrix in next step. Useful for when using large numbers of features
###########!!!!!!!!!!!!!!Issue occuring with the "%" sign in column names
#regform <- as.formula(paste("~ ", paste(names(pheno),collapse="+")))
#regform2 <- as.formula(paste("~ ", paste(names(pheno)[-6],collapse="+")))
#Example output: ~CD71 + CD71_Batch + SEX + RACE + ETHNICITY + HGB
### Model matrices
mod = model.matrix(~ CD71_libprep_batch + Sex + Chronic.Pain. + steadyState.vs.voc , data=pheno)
mod0 = model.matrix(~ CD71_libprep_batch + Sex + Chronic.Pain.,data=pheno)
### Determine number of surrogate variables to calculate manually
n.sv <- num.sv(gct,mod,method="leek")
#num.sv(gct,mod,method="be") #'be' method is older
### Run SVA; automatically determine number of surrogate variables to calculate
sva.out <- sva(gct,mod,mod0, n.sv = n.sv)
############## Determine relationship of surrogate variables to metadata ##########
# Add surrogate variables to metadata
pheno$sv1 <- sva.out$sv[,1]
#pheno$sv2 <- sva.out$sv[,2]
#pheno$sv3 <- sva.out$sv[,2]
# Fit a generalized linear model for to see what metadata features are associated
# with the surrogate variables.
# FIX: Automate removal of surrogate variables associated with metadata feature of interest
# from downstream analysis
glm.sv1 <- glm(sv1 ~ CD71_libprep_batch + Sex + steadyState.vs.voc, data = pheno)
#summary(glm.sv1)
coef(summary(glm.sv1))[,4]
# glm.sv2 <- glm(sv2 ~ CD71_libprep_batch + Sex + steadyState.vs.voc, data = pheno)
# summary(glm.sv2)
# glm.sv3 <- glm(sv3 ~ CD71_libprep_batch + Sex + steadyState.vs.voc, data = pheno)
# summary(glm.sv3)
############### PVCA including surrogate variables ##########################
#pheno2 <- pheno
#pheno <- pheno2
# Make surrogate variables categorical which is necessary for PVCA
# conTocat requires minimum of two elements in the list, so when there is only
# one surrogate variable, it can be added twice to the list and removed from the
# metadata later.
# Tried to use it on inpData, but wouldn't allow duplicate columns.
# Instead used on pheno, which was then used to make inpData variable
#var_names <- c("sv1", "sv2", "sv3")
var_names <- c("sv1", "sv1")
pheno<-conTocat(pheno, var_names)
# Only use when there is one surrogate variable; it will remove the extra column
# that was added.
pheno <- pheno[,-ncol(pheno)]
covariates <- c("CD71_libprep_batch", "Sex", "steadyState.vs.voc", "Chronic.Pain.", "sv1_cat")
pct_threshold <- 0.75
pd2 <- new("AnnotatedDataFrame", data = pheno)
inpData <- ExpressionSet(assayData = gct, phenoData = pd2)
# Make PVCA plot including surrogate variables
pvcaPlot2 <- pvcAnaly(inpData, pct_threshold, covariates)
# Make PVCA plot but include interactions between covariates
# pvcaObj <- pvcaBatchAssess(inpData, covariates, pct_threshold)
# bp <- barplot(pvcaObj$dat,
# ylab = "Variance explained",
# ylim= c(0,1.1),col = c("blue"), las=2,
# main="PVCA estimation bar chart")
# axis(1, at = bp, labels = pvcaObj$label, xlab = "Effects", cex.axis = 0.5, las=2)
# values = pvcaObj$dat
# new_values = round(values , 3)
# text(bp,pvcaObj$dat,labels = new_values, pos=3, cex = 0.8)
############################### SNM Analysis #####################################
# Create model matrices for the biological variables of interest and adjustment variables
bio.var.mod = model.matrix(~ steadyState.vs.voc , data=pheno)
adj.var.mod = model.matrix(~ Sex + CD71_libprep_batch + sv1 + Chronic.Pain., data = pheno)
# Normalize the count matrix using the biological variable and adjustment variable model matrices
# NOTE: This leave negative values in the matrix!
norm_counts = snm(gct, bio.var=bio.var.mod, adj.var=adj.var.mod, rm.adj=T)
head(summary(norm_counts))
#ks.test(norm_counts$pval, "punif")
# Save SNM normalized matrix
#saveRDS(norm_counts$norm.dat, file = "SNM_normalized_cd71_baselinevsvoc.gct.rds")
######### Enrichment Analysis ############################
library(qvalue)
# Calculate q-values and order from smallest to largest
q <- qvalue(norm_counts$pval)
deg <- data.frame("gene" = row.names(gct), "qval" = q$qvalues, "pval" = q$pvalues)
deg <- deg[order(deg$qval),]
number_of_DEG <- 3778
### Print top DEGs
# Below bonferroni threshold
# bonferroni_cutoff <- .05/nrow(gct)
# counter <- 0
# for (i in 1:nrow(deg)) {
# if(deg$pval[i]<bonferroni_cutoff){
# counter <- counter +1
# cat(deg$gene[i], "\n")
# }
# }
# Based on q-value
# for (i in 1:number_of_DEG) {
# cat(deg$gene[i], "\n")
# }
top_found_genes <- deg[1:number_of_DEG,"gene"]
#write.table(top_found_genes, file = "found_genes_12k_1e-04.txt", row.names = F, col.names = F, quote = F)