proteomics.Rmd

---
title: "James P. Johnson - Proteomics LIMMA Regression - v3-7-23"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Ctrl+Shift+Enter*. 

```{r}
#Load some useful libraries
library(limma)
library(DESeq2)
library(edgeR)
library(data.table)
library(tidyr)
library(magrittr)
#library(tidyverse)
library(sjlabelled)
library(sjmisc)
library(ggplot2)
library(broom)
library(DataCombine)
library(dplyr)
library(ggpubr)
library(stringr)
#library(ggprism)
library(patchwork)
library(magrittr)
library(ggbeeswarm)
library(data.table)
```


```{r}
#import data
# `full` supplies the covariates used in the model formula, `countdf` the
# measurement table passed to the model fit, and `meta` the per-feature
# annotation (`name`, `gene_name`, `gene_description`) joined onto the results.
full <- read.csv(file = '../Final_Analysis/proteomics.csv')
meta <- read.csv(file = '../Final_Analysis/proteomics_metadata_table.csv')
countdf <- read.csv(file = '../Final_Analysis/proteomics_count.csv')

#clean-up tables
#prepare for model
#make consistent index/column/label/ordering between tables/matrices

# remove added X's from PCI names
# read.csv() prefixes column names that are not syntactically valid (e.g.
# names starting with a digit) with "X". Only that leading "X" is stripped:
# removing every "X" would also corrupt names that legitimately contain one
# and break later look-ups of columns by feature name.
names(countdf) <- str_remove(names(countdf), "^X")
names(meta) <- str_remove(names(meta), "^X")
names(full) <- str_remove(names(full), "^X")

#factorize sex
full$sex <- factor(full$sex)

#factorize BMF
# Categories are coded 1-4; the outer factor() drops codes absent from the data.
full$bowel <- factor(as.numeric(factor(full$bowel, levels = c(1, 2, 3, 4), labels = c(1, 2, 3, 4))))
# Select the reference level by label, not by position: with a numeric `ref`
# a missing category would silently make a different level the baseline.
full$bowel <- relevel(full$bowel, ref = "3")

# Columns 10-12 are renamed by position.
# NOTE(review): assumes the column order of proteomics.csv is fixed - confirm.
names(full)[10:12] <- c('CRP', 'LDL', 'A1C')

# Use the first column of `countdf` as row names, then drop it so that only
# the measurement columns remain.
rownames(countdf) <- countdf[[1]]
countdf[1] <- NULL
rownames(full) <- NULL

# Display the cleaned tables
countdf
full
meta
```


```{r}
#design GLM using lmFit and eBayes
#Algorithm provided by and adapted from Christian Diener, PhD:
###########################################################################################################
# Design matrix: BMF category plus the adjustment covariates, all read from `full`.
# NOTE(review): nothing here checks that the rows of `full` line up with the
# columns of `countdf` - confirm the two tables share the same sample order.
design <- model.matrix(
  ~ bowel + sex + age + BMI_CALC + eGFR + PC1 + PC2 + PC3 + CRP + LDL + A1C,
  data = full
)
# cpm() is called with log = FALSE, so despite the variable name the values
# are NOT log-transformed at this step.
logCPM <- cpm(countdf, log = FALSE)
# Fit one linear model per row of `logCPM`, then apply eBayes() moderation.
fit <- eBayes(lmFit(logCPM, design))
###########################################################################################################

```


```{r}
# Per-coefficient result tables.
#
# `bowel` is releveled so that category 3 ("High Normal") is the reference,
# which gives the design matrix one column per remaining category:
#   bowel1 = Constipation, bowel2 = Low Normal, bowel4 = Diarrhea.
# Coefficients are selected by column name so that each table is tied to the
# intended term (previously all three BMF tables used coef = 2, i.e. they all
# repeated the Constipation result).

# Columns kept in the summary tables. `name` is retained because the plotting
# chunk looks features up through `bound['name']`.
summary_cols <- c('name', 'logFC', 'B', 'adj.P.Val', 'gene_name', 'P.Value', 'gene_description')

# Full topTable() output for one coefficient, joined onto `meta`.
#   coef: name (or index) of a design-matrix column.
# Returns a data frame with the columns of `meta` plus the limma statistics.
annotated_top_table <- function(coef) {
  # number = Inf returns every feature; sort.by is spelled out in full
  res <- topTable(fit, coef = coef, genelist = meta$name, sort.by = "P", number = Inf)
  names(res)[1] <- 'name' # topTable() calls the genelist column "ID"
  # replace the annotation name with the matching row name of `countdf`
  res[1] <- rownames(countdf)[match(res$name, meta$name)]
  dplyr::inner_join(meta, res, by = intersect(names(meta), names(res)))
}

# Rows with adjusted p < 0.05; which() drops rows whose p-value is NA.
significant_rows <- function(tbl) {
  tbl[which(tbl$adj.P.Val < 0.05), ]
}

# Table ordered by adjusted p-value and reduced to the summary columns.
ordered_summary <- function(tbl) {
  tbl[order(tbl$adj.P.Val), summary_cols]
}

#Get results table for Constipation coefficient relative to High Normal BMF:
re_const <- annotated_top_table("bowel1")
p_aconst <- significant_rows(re_const)
p_const <- ordered_summary(re_const)

#Get results table for Low Normal coefficient relative to High Normal BMF:
re_low <- annotated_top_table("bowel2")
p_alow <- significant_rows(re_low)
p_low <- ordered_summary(re_low)

#Get results table for Diarrhea coefficient relative to High Normal BMF:
re_diarrhea <- annotated_top_table("bowel4")
p_adiarrhea <- significant_rows(re_diarrhea)
p_diarrhea <- ordered_summary(re_diarrhea)

#Get results table for the eGFR covariate:
re_egfr <- annotated_top_table("eGFR")
p_aegfr <- significant_rows(re_egfr)
p_egfr <- ordered_summary(re_egfr)

#Show dfs of significant hits
sig_const <- significant_rows(p_const)
sig_low <- significant_rows(p_low)
sig_diarrhea <- significant_rows(p_diarrhea)
sig_egfr <- significant_rows(p_egfr)
sig_const
sig_low
sig_diarrhea
sig_egfr
```


```{r}
#Prepare for plotting:
# Pairs of BMF categories that receive a significance bracket.
comparisons <- list(c("Low Normal", "High Normal"), c("Diarrhea", "High Normal"))

# Tag each hit table with its BMF category. The label is repeated to the
# table's actual row count instead of a hard-coded number of hits.
sig_low$bowel <- rep('Low Normal', nrow(sig_low))
sig_diarrhea$bowel <- rep('Diarrhea', nrow(sig_diarrhea))
bound <- rbind(sig_low, sig_diarrhea)

# The hit tables must carry the feature identifier used as column name in `full`.
if (!'name' %in% names(bound)) {
  stop("`sig_low`/`sig_diarrhea` need a `name` column to select protein columns from `full`", call. = FALSE)
}

# One column per distinct significant feature, in the order of `bound`.
# A feature that is significant in both comparisons is selected only once.
first_hit <- !duplicated(bound[['name']])
a <- full[, bound[['name']][first_hit], drop = FALSE]
names(a) <- bound[['gene_name']][first_hit]

# First six columns of `full` followed by the protein columns.
biochemistry <- cbind(full[, 1:6], a)
# NOTE(review): position 11 is hard-coded as the second TNFRSF11B column;
# confirm it still is whenever the set of significant hits changes.
names(biochemistry)[11] <- 'TNFRSF11B (2nd Instance)'
biochemistry$bowel <- factor(biochemistry$bowel, levels = c(1, 2, 3, 4), labels = c("Constipation", "Low Normal", "High Normal", "Diarrhea"))


#Annotation function:
# Map a single p-value to a significance label:
#   p < 0.001 -> "***", p < 0.01 -> "**", p < 0.05 -> "*", otherwise NA.
sig <- function(x) {
  if (x >= 0.05) {
    return(NA)
  }
  if (x >= 0.01) {
    return("*")
  }
  if (x >= 0.001) {
    return("**")
  }
  "***"
}

#Plotting function:
# Stand-in "test" passed to geom_signif(test = 'test'). It runs no statistical
# test; it looks up the adjusted p-value already computed for protein `j`.
#
#   x, y  accepted but ignored (the values geom_signif hands to a test function)
#   z     ignored on entry; overwritten with the category of the current comparison
#   j     gene name to look up in `sig_low` / `sig_diarrhea`
#
# Returns list(p.value = <single number>); 1 when `j` is not a significant hit
# for the current comparison.
#
# Which comparison is "current" is tracked through the global `counter`:
# 1 means the first entry of `comparisons`, anything else the second. The
# function advances the counter itself, so calls must alternate in that order.
test <- function(x, y, z, j) {
  if (counter == 1) {
    z <- comparisons[[1]][1]
  } else {
    z <- comparisons[[2]][1]
    counter <<- 0
  }
  print(j)

  # A gene name can occur more than once in a hit table; keep the first match
  # so that p.value is always a single number.
  first_adj_p <- function(hits) {
    hits[['adj.P.Val']][which(hits[['gene_name']] == j)][1]
  }

  # %in% never yields NA, unlike any(... == j) with missing gene names
  if (str_detect(z, "Diarrhea") && j %in% sig_diarrhea[['gene_name']]) {
    results <- list(p.value = first_adj_p(sig_diarrhea))
  } else if (str_detect(z, "Low Normal") && j %in% sig_low[['gene_name']]) {
    results <- list(p.value = first_adj_p(sig_low))
  } else {
    results <- list(p.value = 1)
  }
  counter <<- counter + 1
  results
}

#Begin accruing plots:
# Builds one beeswarm + boxplot panel per protein column of `biochemistry`
# (columns 7 onward) and stores it in `myplots`. Each panel is printed as it
# is built. Significance brackets come from the custom `test` function, which
# relies on the global `counter` being 1 when the first panel is drawn.
counter <<- 1
myplots <- list()  # new empty list
# seq_len() gives an empty loop when there are no protein columns
for (ind in seq_len(max(ncol(biochemistry) - 6, 0))) {
  myplots[[ind]] <- local({
    col_idx <- ind + 6
    label <- paste0(names(biochemistry)[col_idx])
    # The renamed duplicate column shares its annotation and p-values with 'TNFRSF11B'
    lookup_name <- if (label != 'TNFRSF11B (2nd Instance)') label else 'TNFRSF11B'
    # [1]: a gene name can match several rows of `bound`; use one description
    sub <- paste(bound['gene_description'][which(bound['gene_name'] == lookup_name), ])[1]
    print(label)
    print(sub)
    n <- 2
    values <- biochemistry[, col_idx]
    plotlim_lower <- min(values, na.rm = TRUE)
    plotlim_upper <- max(values, na.rm = TRUE)
    # Brackets are positioned below the visible y-range (clip = "off"),
    # and the bottom plot margin is enlarged accordingly.
    plotlim_bar <- plotlim_lower - n
    plotlim_margin <- abs(plotlim_bar - n * 10)
    # Only the first panel of each row of five carries a y-axis title
    y_title <- if (ind %% 5 == 1) "Protein Level" else ""
    plt <- ggplot(data = biochemistry, aes(x = bowel, y = .data[[label]], group = bowel)) +
      scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
      geom_beeswarm(aes(color = bowel), size = 0.1, cex = 0.5) +
      geom_boxplot(alpha = 0.0, outlier.shape = NA) +
      theme(text = element_text(size = 9)) +
      ggtitle(label = str_wrap(label, width = 2), subtitle = str_wrap(sub, width = 20)) +
      coord_cartesian(ylim = c(plotlim_lower, plotlim_upper), clip = "off") +
      geom_signif(comparisons = comparisons, map_signif_level = sig, test = 'test', test.args = list(z = comparisons, j = lookup_name),
                  y_position = plotlim_bar,
                  step_increase = 0.10, size = 0.5,
                  textsize = 1.5,
                  tip_length = c(0, 0)) +
      labs(color = "BMF Category", y = y_title) +
      guides(colour = guide_legend(override.aes = list(size = 7), title.position = 'left', nrow = 1, ncol = 4)) +
      theme(plot.margin = unit(c(0, 0, plotlim_margin, 0), "pt"),
            plot.title = element_text(size = 5.75),
            plot.subtitle = element_text(size = 4.5),
            legend.title = element_text(size = 10),
            legend.text = element_text(size = 7),
            axis.text.x = element_blank(),
            axis.text.y = element_text(size = 7),
            axis.title.y = element_text(size = 7),
            axis.title.x = element_blank(),
            aspect.ratio = 0.95) +
      scale_fill_manual(limits = c("Constipation", "Low Normal", "High Normal", "Diarrhea"), labels = c("Constipation", "Low Normal", "High Normal", "Diarrhea"), values = colors(),
                        drop = FALSE)
    # print() draws the panel and returns it invisibly, so the plot object
    # becomes the value of local() and is stored in `myplots`
    print(plt)
  })
}

```


```{r}
#Arrange plots:
# Panels are laid out on 2 x 5 grids: plots 1-10, 11-20 and 21-26, lettered
# A-Z. `counter` (read by the custom `test` function) is reset to 1 before
# each arrangement.

# Figure 1 carries the shared legend, taken from the second panel.
counter <<- 1
figure1 <- ggarrange(
  plotlist = myplots[1:10],
  labels = LETTERS[1:10],
  font.label = list(size = 9.5),
  nrow = 2,
  ncol = 5,
  align = "hv",
  legend = "top",
  common.legend = TRUE,
  legend.grob = get_legend(myplots[[2]])
)
figure1

# Figure 2: no legend.
counter <<- 1
figure2 <- ggarrange(
  plotlist = myplots[11:20],
  labels = LETTERS[11:20],
  font.label = list(size = 9.5),
  nrow = 2,
  ncol = 5,
  align = "hv",
  legend = "none"
)
figure2

# Figure 3: the remaining six panels on the same grid size, no legend.
counter <<- 1
figure3 <- ggarrange(
  plotlist = myplots[21:26],
  labels = LETTERS[21:26],
  font.label = list(size = 9.5),
  nrow = 2,
  ncol = 5,
  align = "hv",
  legend = "none"
)
figure3
```


```{r}
# Save the three arranged figures as PNG files in the working directory.
# Every figure is written with identical ggsave() settings (scale 1.5,
# 300 dpi); `counter` is reset to 1 before each save.
figure_files <- list(
  "BMFvsProteins1.png" = figure1,
  "BMFvsProteins2.png" = figure2,
  "BMFvsProteins3.png" = figure3
)
for (png_name in names(figure_files)) {
  counter <<- 1
  ggsave(
    png_name,
    plot = figure_files[[png_name]],
    device = NULL,
    path = NULL,
    scale = 1.5,
    width = NA,
    height = NA,
    units = c("in", "cm", "mm", "px"),
    dpi = 300,
    limitsize = TRUE,
    bg = NULL
  )
}
```


Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.