Skip to content

Commit

Permalink
Update FLAD1_R
Browse files Browse the repository at this point in the history
  • Loading branch information
Xiang-yuZHAO authored Apr 18, 2024
1 parent ec30cc1 commit 763abc8
Showing 1 changed file with 0 additions and 325 deletions.
325 changes: 0 additions & 325 deletions script/FLAD1_R.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ library(reshape2)
library(ggplot2)
library(ggpubr)
library(ggthemes)
library(randomForest)
library(tidyverse)
library(skimr)
library(DataExplorer)
Expand Down Expand Up @@ -165,330 +164,6 @@ FLAD1_cnv1 <- FLAD1_cnv1[!duplicated(FLAD1_cnv1$id),]
FLAD1_cnv1 <- merge(FLAD1_cnv1, hyScore, by.x = "id", by.y = "patient_id")


#### figS3 ####
# * 1. 数据 ----
# 1. 代谢相关的基因集
hsa_path_gene <- keggLink("pathway", "hsa") %>%
tibble(pathway = ., eg = sub("hsa:", "", names(.))) %>%
mutate(
symbol = mapIds(org.Hs.eg.db, eg, "SYMBOL", "ENTREZID")
)

hsa_pathways <- keggList("pathway", "hsa") %>%
tibble(pathway = names(.), description = .)

hsa_path_gene$pathway <- gsub(pattern = "path:",
replacement = "",
x = hsa_path_gene$pathway)
path_gene <- merge(x = hsa_path_gene, y = hsa_pathways,
by = "pathway",
all.x = T)

path_gene$description <- gsub(pattern = " - Homo sapiens \\(human\\)",
replacement = "",
x = path_gene$description)
length(path_gene$symbol[!duplicated(path_gene$symbol)])
path_gene$description[!duplicated(path_gene$description)]

# https://www.genome.jp/kegg/pathway.html
kegg_metabolism <- readxl::read_excel(path = "~/FLAD1/Metabolism/kegg_metabolism.xlsx",
col_names = T)

kegg_metabolism <- merge(x = path_gene, y = kegg_metabolism,
by.x = "description", by.y = "1.0 Global and overview maps")


# 2. 代谢基因fpkm
df_mRNA1 <- lapply(names(df_mRNA), function(name) {
df <- df_mRNA[[name]]

df <- merge(x = probe,y = df, by.x = "id", by.y = "Ensembl_ID" )

df$mean <- apply(df[,-c(1,2)],1,mean)
df <- df[order(df$gene, -df$mean),]
df <- df[!duplicated(df$gene),]

row.names(df) <- df$gene
df <- df[,-which(colnames(df) %in% c("gene", "id", "mean"))]
df <- as.data.frame(t(df))

return(df)
})

df_mRNA2 <- do.call(rbind, df_mRNA1)
df_mRNA3 <- df_mRNA2[,colnames(df_mRNA2) %in% kegg_metabolism$symbol]

# 3. 肿瘤中上调基因
sample_type <- lapply(names(df_mRNA), function(name){
df <- df_mRNA[[name]]
df <- data.frame(
sample = colnames(df)[-1],
type = name
)
return(df)
})

sample_type <- do.call("rbind", sample_type)
sample_type$class <- ifelse(
substr(sample_type$sample,14,15) %in% c(paste0("0",1:9)), "tumor",
ifelse(
substr(sample_type$sample,14,15) %in% c(paste0("1",0:9)), "normal",
"other"
)
)

unique(sample_type$type) %in% cancerType
cancerType1 <- table(sample_type$type, sample_type$class)
cancerType1 <- as.data.frame(cancerType1)
cancerType1 <- as.character(cancerType1[cancerType1$Var2 == "normal" & cancerType1$Freq < 4,]$Var1) # 去除normal 太少的样本
cancerType1 <- cancerType[!cancerType %in% cancerType1][-3]


DEG_NT <- lapply(cancerType1, function(type){
tumorsample <- sample_type[sample_type$type == type & sample_type$class == "tumor",]$sample
normalsample <- sample_type[sample_type$type == type & sample_type$class == "normal",]$sample

tumorsample <- df_mRNA3[row.names(df_mRNA3) %in% tumorsample,]
normalsample <- df_mRNA3[row.names(df_mRNA3) %in% normalsample,]

df <- lapply(1:length(colnames(tumorsample)), function(x){
data.frame(
gene = colnames(tumorsample)[x],
pval = wilcox.test(tumorsample[,x], normalsample[,x])$p.value,
normalMean = mean(normalsample[,x]),
tumorMean = mean(tumorsample[,x])
)
})
df <- do.call("rbind", df)
df$type <- type
print(type)
return(df)
})

DEG_NT <- do.call("rbind", DEG_NT)
DEG_NT$foldchange <- DEG_NT$tumorMean / DEG_NT$normalMean
DEG_NT$log2fc <- log(x = DEG_NT$foldchange,base = 2)

DEG_NT1 <- DEG_NT[DEG_NT$normalMean != 0,]

DEG_NT1 <- DEG_NT1[DEG_NT1$type != "PAAD",]

res <- DEG_NT1 %>%
group_by(gene) %>%
summarise(frequency = sum(pval < 0.05 & log2fc > 0), .groups = 'drop')
res <- res[order(-res$frequency, res$gene),] # 肿瘤中显著上调的基因


# 4. 蛋白表达与mRNA水平一致基因
cor_RNA_protein # 数据 https://doi.org/10.1016/j.cell.2020.08.036
res1 <- merge(x = res, y = cor_RNA_protein, by.x = "gene", by.y = "Gene", all.x = T)
res1 <- res1 %>%
dplyr::filter(sig_indicator_for_Spearman_cor == "pos.sig") %>%
dplyr::arrange(desc(frequency))

# 5. 合并缺氧得分与df_mRNA3
df_mRNA4 <- df_mRNA3
df_mRNA4$patient_id <- row.names(df_mRNA4)
tumorSample <- df_mRNA4$patient_id[which(substr(df_mRNA4$patient_id,14,15) %in% c(paste0("0",1:9)))]
df_mRNA4 <- df_mRNA4[df_mRNA4$patient_id %in% tumorSample,]
df_mRNA4$patient_id <- gsub("-",".",substr(df_mRNA4$patient_id, 1, 12))
df_mRNA4 <- merge(hyScore[,c(1,3)], df_mRNA4, by = "patient_id") # 选择 buffa/wintr/ragnum 缺氧评分

df_mRNA4 <- df_mRNA4[,-1]
colnames(df_mRNA4)[1]
colnames(df_mRNA4)[1] <- "hyScore"


# 6. 随机森林数据集df_mRNA5
df_mRNA5 <- df_mRNA4[,c(1,which(colnames(df_mRNA4) %in% res1$gene[c(1:49)]))]
df_mRNA5 <- scale(df_mRNA5)
df_mRNA5 <- as.data.frame(df_mRNA5)


# * 2. 数据拆分 ----
set.seed(42)
trains <- createDataPartition(y = df_mRNA5$hyScore, p = 0.75, list = F)
traindata <- df_mRNA5[trains,]
testdata <- df_mRNA5[-trains,]

hist(traindata$hyScore,breaks = 20)
hist(testdata$hyScore,breaks = 20)

colnames(traindata)
form_reg <- as.formula(
paste0(
"hyScore ~",
paste(colnames(traindata)[2:length(colnames(traindata))], collapse = " + ")
)
)
form_reg

# * 3. 训练模型 ----
set.seed(42)
fit_rf_reg <- randomForest(
formula = form_reg,
data = traindata,
ntree = 500,
mtry = 6,
importance = T
)

fit_rf_reg
plot(fit_rf_reg, main = "ERROR & TREES")

a <- importance(fit_rf_reg)
a <- as.data.frame(a)
a$gene <- row.names(a)
a <- a[order(-a$IncNodePurity),]
a$gene <- factor(x = a$gene, levels = rev(a$gene))
# 分别使用winter, buffa, ragnum获得a_winter, a_buffa, a_ragnum

# * 4. 模型预测 ----
# 训练集
trainpred <- predict(fit_rf_reg, newdata = traindata)
defaultSummary(data.frame(obs = traindata$hyScore, pred = trainpred))

# 测试集预
testpred <- predict(fit_rf_reg, newdata = testdata)
defaultSummary(data.frame(obs = testdata$hyScore, pred = testpred))


# 训练集和测试集结果
preddresult <- data.frame(
obs = c(traindata$hyScore, testdata$hyScore),
pred = c(trainpred, testpred),
group = c(rep("Train", length(trainpred)),
rep("Test", length(testpred)))
)

# * 5. 结果展示 ----
# 1. 绘制相关性
cor_hypoxia <- hyScore[,2:4]
colnames(cor_hypoxia) <- c("Buffa", "Winter", "Ragnum")
corrplot.mixed(corr = cor(cor_hypoxia),
lower = "number", upper = "pie",
tl.pos = "d", tl.col = "black")

# 2. 绘制回归结果
anno_train <- defaultSummary(data.frame(obs = traindata$hyScore, pred = trainpred))
anno_test <- defaultSummary(data.frame(obs = testdata$hyScore, pred = testpred))

anno_train <- paste0("Train: ","RMSE = ", round(anno_train[1],3), " ",
"R2 = ", round(anno_train[2],3), " ",
"MAE = ", round(anno_train[3],3))
anno_test <- paste0("Test: ","RMSE = ", round(anno_test[1],3), " ",
"R2 = ", round(anno_test[2],3), " ",
"MAE = ", round(anno_test[3],3))

ggplot(preddresult,
aes(x = obs, y = pred, color = group, fill = group)) +
geom_point(size = 1,alpha = .6) +
geom_smooth(method = "lm", se = T) +
geom_abline(intercept = 0, slope = 1, size = 1.2 ) +
geom_xsidedensity(aes(y=stat(density)), alpha = .1) +
geom_ysidedensity(aes(x=stat(density)), alpha = .1) +
scale_x_continuous(breaks = seq(-2,2,1)) +
scale_color_manual(values = c(Train = "#FE817D", Test = "#81B8DF")) +
scale_fill_manual(values = c(Train = "#FE817D", Test = "#81B8DF")) +
scale_xsidey_continuous(breaks = seq(0,0.3,0.3)) +
scale_ysidex_continuous(breaks = seq(0,0.3,0.3)) +
xlab("Winter hypoxia score") +
ylab("Predicted hypoxia score") +
annotate(geom = "text", x = -0.7, y = 2,
label = anno_train,
color = "#FE817D", size = 6)+
annotate(geom = "text", x = -0.7, y = 1.7,
label = anno_test,
color = "#81B8DF", size = 6)+
theme_bw() +
theme(legend.position = "none",
ggside.panel.scale = 0.2,
axis.text.x = element_text(size = 30),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30, face = "bold"),
)

# 3. 热图展示不同基因的在不同肿瘤中的表达情况
getdf_sample_gene_hypoxia <- function(genename){
df <- df_mRNA3
df$patient_id <- row.names(df)
tumorSample <- df$patient_id[which(substr(df$patient_id,14,15) %in% c(paste0("0",1:9)))]
df <- df[df$patient_id %in% tumorSample,]

df$patient_id <- gsub("-",".",substr(df$patient_id, 1, 12))

df <- merge(hyScore[,c(1,4)], df, by = "patient_id") # Ragnum缺氧评分
colnames(df)[2] <- "hyScore"

df <- df[order(-df$hyScore),]

df <- df[,which(colnames(df) %in% genename)]


return(df)
}


scale_to_range <- function(x, min_range, max_range){
# 查找原始数据的最小值和最大值
x_min <- min(x)
x_max <- max(x)

# 缩放到[min_range, max_range]
scaled_x <- (x - x_min) / (x_max - x_min)

return(scaled_x)
}

phe_sample_gene <- getdf_sample_gene_hypoxia(genename = unique(a_total$gene))
phe_sample_gene <- as.data.frame(lapply(phe_sample_gene, scale_to_range, min_range = -1, max_range = 1))

phe_sample_gene <- phe_sample_gene[,match(
a_total[a_total$hyscore == "ragnum",]$gene,
colnames(phe_sample_gene))]


pheatmap(phe_sample_gene,
cluster_rows = F,
cluster_cols = F,
show_rownames = F,
show_colnames = T,
legend = T,
fontsize = 20,
angle_col = 90,
main = "Ragnum")
skim(phe_sample_gene)

# 5. 基因列表(贡献度图)
a_total <- rbind(a_buffa[1:20,], a_winter[1:20,], a_ragnum[1:20,])

a_total <- a_total[a_total$gene %in% intersect(a_ragnum$gene[1:20],
intersect(a_buffa$gene[1:20], a_winter$gene[1:20])),]

a_total$hyscore <- factor(x = a_total$hyscore,
levels = c("ragnum", "winter", "buffa"))
a_total$gene <- factor(x = a_total$gene,
levels = rev(a_ragnum$gene))

ggplot(a_total, aes(x = gene, y = IncNodePurity)) +
geom_segment(aes(x = gene, xend = gene, y = 0, yend = IncNodePurity), color="skyblue") +
geom_point(aes(size = `%IncMSE`), color="blue", alpha=0.6) +
theme_light() +
coord_flip() +
facet_grid(. ~ hyscore) +
scale_y_continuous(breaks = seq(0,1000,500)) +
theme_bw() +
theme(
legend.position="bottom",
axis.text.x = element_text(size = 20),
axis.text.y = element_text(size = 20),
axis.title.x = element_text(size = 20, face = "bold"),
axis.title.y = element_blank(),
strip.text = element_text(size = 20),
strip.background = element_rect(fill = "white", color = "white")
)



#### fig2 ####
# * 1. 计算基因与缺氧评分的相关性 ----
Expand Down

0 comments on commit 763abc8

Please sign in to comment.