PRS_LDpred_BCA.R

#!/usr/bin/env Rscript
#To compute PRS on BCA samples using Validation_EA_genotyped.weight generated by PRS_LDpred_cambridge.sh
# generate data for LDpred,for imputed validation data 
#use Cambridge+BEACONcontrol as the validation, only work on EA
#BEACONcase+AMOS+other BEACONcontrol for discovery

library(data.table)
#generate disc/valid samples
# Load the sample annotation sheet (worksheet 1). read_excel() is called through
# its namespace because only data.table has been attached at this point of the
# script; a bare read_excel() would fail with "could not find function".
sampletable=readxl::read_excel("/fh/fast/dai_j/BEACON/BEACON_GRANT/data/PLINKinputCombo_bca_07Feb2018.xls",1)
# Convert to a plain data.frame for the base-R indexing used below.
sampletable=as.data.frame(sampletable)
# Site codes: 30 = Cambridge, 55 = AMOS
site_code <- sampletable$site
ea_pheno <- sampletable$phenoEA_bca
# EA phenotype counts for sites coded below 30
table(ea_pheno[which(site_code < 30)])
# -9    1    2 
# 2566 2185 1512
# EA phenotype counts for site 30
table(ea_pheno[which(site_code == 30)])
# -9    2 
# 936 1003 
# EA phenotype counts for site 55
table(ea_pheno[which(site_code == 55)])
# 1 
# 1022 
# Full site x EA phenotype cross-tabulation (deparse.level = 0 keeps the
# dimension headers blank, as when the columns are passed as expressions)
table(site_code, ea_pheno, deparse.level = 0)
#      -9    1    2
# 11  174    0  102
# 12  303  214    0
# 13    2  116   63
# 14  822    0  502
# 15    0   88   42
# 16  202  218  193
# 17  332  323    0
# 18   36  259  247
# 19  101    0    0
# 20  160  167    0
# 21  122   92   54
# 22    2   26   14
# 23    8  437   59
# 25    0  245  236
# 27  302    0    0
# 30  936    0 1003
# 55    0 1022    0
# NA   15    0    0
#sites: control will be picked to combine with Cambridge to form validationset

# Meta-analysis table; its "P-value" column is renamed to "P".
EAmeta <- as.data.frame(fread("../result/Discovery_Bonn_METAANALYSIS_EA_comsnp1.tbl"))
names(EAmeta)[names(EAmeta) == "P-value"] <- "P"
# Second summary table, used here only as a SNP -> chromosome/position lookup.
EABeacon <- as.data.frame(fread("/fh/fast/dai_j/BEACON/BEACON_GRANT/result/EA_Discovery_BD_autosomes_comsnp_N.txt"))
# Row of EABeacon for each meta-analysis marker (NA when the marker is absent).
beacon_row <- match(EAmeta$MarkerName, EABeacon$SNP)
EAmeta$chr <- EABeacon$CHR[beacon_row]
EAmeta$pos <- EABeacon$position[beacon_row]
# write.table(EAmeta,file="/fh/fast/dai_j/BEACON/BEACON_GRANT/result/EA_Discovery_Bonn_imp_metastat.txt",col.names = T,
#             row.names=F,quote=F,sep="\t")
# cmd="gzip /fh/fast/dai_j/BEACON/BEACON_GRANT/result/EA_Discovery_Bonn_imp_metastat.txt"
# system(cmd)
# 

# Build a PLINK fileset whose SNP IDs follow the meta-analysis table and whose
# phenotype column comes from `sampletable`.
#
# Steps:
#   1. read <prefix>.bim and rename every variant whose chr:pos_allele_allele
#      key (in either allele order) matches a row of `metadat`; write the
#      result to <prefix1>.bim
#   2. copy <prefix>.bed and <prefix>.fam to <prefix1>.bed / <prefix1>.fam
#   3. overwrite column 6 of <prefix1>.fam with the phenotype selected by `opt`
#   4. write <prefix1>.pheno holding fam columns 1, 2 and 6
#
# Arguments:
#   prefix   path prefix of the input PLINK fileset (.bim/.bed/.fam)
#   opt      phenotype to use: "BE" (phenoBE_bca), "EA" (phenoEA_bca) or
#            "BEEA" (phenoEABE_bca); any other value is an error
#   prefix1  path prefix of the output fileset
#   metadat  data frame with columns MarkerName, Allele1, Allele2, chr, pos
#
# Reads the global `sampletable` (columns localid and the phenotype column).
# Called for its side effects (files written, two counts printed).
generate_plink=function(prefix="/fh/fast/dai_j/BEACON/BEACON_GRANT/data/imputation/bca_1000g/bca_filter_noambiguous_NoNA",
                        opt="EA",prefix1="/fh/fast/dai_j/BEACON/BEACON_GRANT/result/BCA_EA",metadat=EAmeta)
{
  # Resolve the phenotype column up front so that an unsupported `opt` stops
  # before any file is written instead of silently keeping the old phenotype.
  phenocol=switch(opt,
                  BE="phenoBE_bca",
                  EA="phenoEA_bca",
                  BEEA="phenoEABE_bca",
                  stop("opt must be one of \"BE\", \"EA\" or \"BEEA\"",call. = FALSE))
  if (!phenocol %in% colnames(sampletable))
  {
    stop("sampletable has no column ",phenocol,call. = FALSE)
  }
  
  bim <- read.table(paste0(prefix,".bim"),stringsAsFactors = FALSE)
  colnames(bim) <- c("CHR", "SNP", "CM", "BP", "B.A1", "B.A2")
  metadat$Allele1=toupper(metadat$Allele1) #Allele1 is non-effect
  metadat$Allele2=toupper(metadat$Allele2)
  colnames(metadat)[which(colnames(metadat)=="MarkerName")]="SNP"
  colnames(metadat)[which(colnames(metadat)=="chr")]="CHR"
  colnames(metadat)[which(colnames(metadat)=="pos")]="BP"
  #update bim SNP ID based on metadat
  # position keys for the bim in both allele orders, and for the meta table
  bim_posname1=paste0(bim$CHR,":",bim$BP,"_",bim$B.A1,"_",bim$B.A2)
  bim_posname2=paste0(bim$CHR,":",bim$BP,"_",bim$B.A2,"_",bim$B.A1)
  meta_posname=paste0(metadat$CHR,":",metadat$BP,"_",metadat$Allele1,"_",metadat$Allele2)
  idx1=which(bim_posname1 %in% meta_posname)
  idx2=which(bim_posname2 %in% meta_posname)
  # same allele order as the meta table
  tmp=bim_posname1[idx1]
  idx3=match(tmp,bim_posname1)
  idx4=match(tmp,meta_posname)
  bim$SNP[idx3]=metadat$SNP[idx4]
  # swapped allele order; applied second, so it wins if both orders match
  tmp=bim_posname2[idx2]
  idx3=match(tmp,bim_posname2)
  idx4=match(tmp,meta_posname)
  bim$SNP[idx3]=metadat$SNP[idx4]
  print(sum(bim$SNP %in% metadat$SNP))#number of overlapped SNPs
  write.table(bim,file=paste0(prefix1,".bim"),col.names = FALSE,row.names = FALSE,sep=" ",quote=FALSE)
  
  
  for (ext in c(".bed",".fam"))
  {
    # shell-quote both paths so spaces/metacharacters cannot break the command,
    # and stop on a failed copy rather than continuing with a stale fileset
    cmd=paste("cp",shQuote(paste0(prefix,ext)),shQuote(paste0(prefix1,ext)))
    status=system(cmd,wait = TRUE)
    if (status!=0)
    {
      stop("failed to copy ",paste0(prefix,ext)," to ",paste0(prefix1,ext),call. = FALSE)
    }
  }
  fam=read.table(paste0(prefix1,".fam"),stringsAsFactors = FALSE)
  # idx=which(!fam$V2 %in% sampletable$localid)
  # tmp=fam[idx,c(1,2)]
  # write.table(tmp,file="../result/BCA_NAsamples_plink.txt",row.names = F,col.names = F,sep="\t",quote=F)
  idx=match(fam$V2,sampletable$localid)
  # number of fam samples without a sampletable row (their phenotype becomes NA);
  # printed explicitly because a bare expression inside a function is discarded
  print(sum(is.na(idx)))
  #"NA10831" "NA12155" "NA12156" "NA10838" "NA12003" "NA12004"
  
  fam$V6=sampletable[[phenocol]][idx]
  write.table(fam,file=paste0(prefix1,".fam"),row.names = FALSE,col.names = FALSE,sep=" ",quote=FALSE)
  #generate phenotype file
  tmp=fam[,c(1,2,6)]
  write.table(tmp,file=paste0(prefix1,".pheno"),row.names = FALSE,col.names = FALSE,sep=" ",quote=FALSE)
}

# Build the fileset with all defaults: opt="EA", output prefix .../result/BCA_EA,
# SNP IDs taken from the global EAmeta table.
generate_plink()


# Write <prefix>.covariate: one row per sample of <prefix>.fam, in fam order,
# with columns IID, sex, age and pc1-pc4 (whichever of sex, age, ev1_bca ..
# ev4_bca exist in `sampletable`, in sampletable's column order).
#
# Arguments:
#   prefix  path prefix; <prefix>.fam is read and <prefix>.covariate is written
#
# Reads the global `sampletable` (matched to the fam on localid). Missing ages
# are replaced by the mean age of the selected samples. Samples of the fam that
# have no row in `sampletable` are kept, with NA covariates.
generate_covariate=function(prefix="/fh/fast/dai_j/BEACON/BEACON_GRANT/result/BCA_EA")
{
  fam=read.table(paste0(prefix,".fam"),stringsAsFactors = FALSE)
  idx=match(fam$V2,sampletable$localid)
  keep=colnames(sampletable) %in% c("sex","age","ev1_bca","ev2_bca","ev3_bca","ev4_bca")
  Covariate=sampletable[idx,keep,drop=FALSE]
  # ev1_bca..ev4_bca -> pc1..pc4
  colnames(Covariate)=sub("^ev([1-4])_bca$","pc\\1",colnames(Covariate))
  Covariate$age[is.na(Covariate$age)]=mean(Covariate$age,na.rm = TRUE)
  # Take IID straight from the fam. Routing it through row names (localid[idx])
  # aborts with "missing values in 'row.names'" as soon as one fam sample is
  # absent from sampletable, and with a duplicate-row-names error on repeated IDs.
  Covariate=data.frame(IID=fam$V2,Covariate)
  write.table(Covariate,file=paste0(prefix,".covariate"),col.names = TRUE,row.names = FALSE,sep=" ",quote=FALSE)
}
#generate_covariate()
# Write the covariate file for the default prefix (.../result/BCA_EA).
generate_covariate()
#generate_covariate(prefix="/fh/fast/dai_j/BEACON/BEACON_GRANT/result/Beacon_BEEA")


#after running PRS_LDpred.sh, read results:
#library(DescTools)
#p.threshold <- c(1,0.3,0.1)
# Thresholds iterated by plot_ROC(); each value is formatted with
# formatC(format="e", digits=4) (e.g. 1.0000e-01) to build the name of the
# score file to look for.
p.threshold <- c(1,0.3,0.1,0.03,0.01,0.003,0.001,0.0003,0.0001)
# Read in the covariates
# read_LDpred=function(prefix="/fh/fast/dai_j/BEACON/BEACON_GRANT/result/BCA_EA")
# {
#   # Read in the phenotype file 
#   phenotype <- read.table(paste0(prefix,".pheno"), header=F)
#   colnames(phenotype)=c("FID","IID","case")
#   
#   
#   covariate <- read.table(paste0(prefix,".covariate"), header=T,stringsAsFactors = F)
#   # Now merge the files
#   pheno <- merge(phenotype, covariate, by=c("IID"))
#   # We can then calculate the null model (model with PRS) using a linear regression 
#   
#   null.model <- glm(I(case==1)~., data=pheno[,!colnames(pheno)%in%c("FID","IID")],family = "binomial")
#   null.model1 <- glm(I(case==1)~1, data=pheno[,!colnames(pheno)%in%c("FID","IID")],family = "binomial")
#   # And the R2 of the null model is 
#   null.r2 <- 1-logLik(null.model)/logLik(null.model1)
#   prs.result <- NULL
#   for(i in p.threshold){
#     # Go through each p-value threshold .score_LDpred_p1.0000e-01.txt
#     tmp=formatC(i, format = "e", digits =4)
#     LDpredfile=paste0(prefix,".score_LDpred_p",tmp,".txt")
#     if (file.exists(LDpredfile))
#     {
#       #prs <- read.table(paste0(prefix,".score_LDpred_p",tmp,".txt"), header=T,sep=",")
#       prs <- read.table(paste0(prefix,".score_LDpred_p",tmp,".txt.adj"), header=T,sep=",")
#       # Merge the prs with the phenotype matrix
#       # We only want the FID, IID and PRS from the PRS file, therefore we only select the 
#       # relevant columns
#       pheno.prs <- merge(pheno, prs, by=c("IID"))
#       # Now perform a linear regression on Height with PRS and the covariates
#       # ignoring the FID and IID from our model
#       
#       model <- glm(I(case==1)~., data=pheno.prs[,!colnames(pheno.prs)%in%c("FID","IID","true_phens","cov_prs")],family = "binomial")
#       model1 <- glm(I(case==1)~1, data=pheno.prs[,!colnames(pheno.prs)%in%c("FID","IID","true_phens","cov_prs")],family = "binomial")
#       # model R2 is obtained as 
#       model.r2 <- 1-logLik(model)/logLik(model1)
#       # R2 of PRS is simply calculated as the model R2 minus the null R2
#       prs.r2 <- model.r2-null.r2
#       #prs.r2=1-logLik(model)/logLik(null.model)
#       # We can also obtain the coeffcient and p-value of association of PRS as follow
#       prs.coef <- summary(model)$coeff["PRS",]
#       #prs.coef <- summary(model)$coeff["cov_prs",]
#       prs.beta <- as.numeric(prs.coef[1])
#       prs.se <- as.numeric(prs.coef[2])
#       prs.p <- as.numeric(prs.coef[4])
#       # We can then store the results
#       prs.result <- rbind(prs.result, data.frame(Threshold=i, R2=prs.r2, P=prs.p, BETA=prs.beta,SE=prs.se))
#     }
#   }
#   # Best result is:
#   prs.result[which.max(prs.result$R2),]
#   return(prs.result)
# }
# 
# plot_LDpred=function(prefix="/fh/fast/dai_j/BEACON/BEACON_GRANT/result/BCA_EA")
# {
#   prs.result=read_LDpred(prefix=prefix)
#   #pdf(paste0(prefix,".LDpred.pdf"),width=12,height=8)
#   png(paste0(prefix,".LDpred.png"), height=10, width=10, res=300, unit="in")
#   # First, obtain the colorings based on the p-value
#   col <- suppressWarnings(colorRampPalette(c("dodgerblue", "firebrick")))
#   # We want the color gradient to match the ranking of p-values
#   prs.result <- prs.result[order(-log10(prs.result$P)),]
#   prs.result$color <-  col(nrow(prs.result))
#   prs.result <- prs.result[order(prs.result$Threshold),]
#   # generate a pretty format for p-value output
#   prs.result$print.p <- round(prs.result$P, digits = 3)
#   prs.result$print.p[!is.na(prs.result$print.p) & prs.result$print.p == 0 ] <-
#     format(prs.result$P[!is.na(prs.result$print.p) & prs.result$print.p == 0 ], digits = 2)
#   prs.result$print.p <- sub("e", "*x*10^", prs.result$print.p)
#   # Generate the axis labels
#   xlab <- expression(italic(P) - value ~ threshold ~ (italic(P)[T]))
#   ylab <- expression(paste("PRS model fit:  ", R ^ 2))
#   # Setup the drawing area
#   layout(t(1:2), widths=c(8.8,1.2))
#   par( cex.lab=1.5, cex.axis=1.25, font.lab=2, 
#        oma=c(0,0.5,0,0),
#        mar=c(4,6,0.5,0.5))
#   # Plotting the bars
#   b<- barplot(height=prs.result$R2, 
#               col=prs.result$color, 
#               border=NA, 
#               ylim=c(0, max(prs.result$R2)*1.25), 
#               axes = F, ann=F)
#   # Plot the axis labels and axis ticks
#   odd <- seq(0,nrow(prs.result)+1,2)
#   even <- seq(1,nrow(prs.result),2)
#   axis(side=1, at=b[odd], labels=prs.result$Threshold[odd], lwd=2)
#   axis(side=1, at=b[even], labels=prs.result$Threshold[even],lwd=2)
#   axis(side=1, at=c(0,b[1],2*b[length(b)]-b[length(b)-1]), labels=c("","",""), lwd=2, lwd.tick=0)
#   # Write the p-value on top of each bar
#   text( parse(text=paste(
#     prs.result$print.p)), 
#     x = b+0.1, 
#     y =  prs.result$R2+ (max(prs.result$R2)*1.05-max(prs.result$R2)), 
#     srt = 45)
#   # Now plot the axis lines
#   box(bty='L', lwd=2)
#   axis(2,las=2, lwd=2)
#   # Plot the axis titles
#   title(ylab=ylab, line=4, cex.lab=1.5, font=2 )
#   title(xlab=xlab, line=2.5, cex.lab=1.5, font=2 )
#   # Generate plot area for the legend
#   par(cex.lab=1.5, cex.axis=1.25, font.lab=2, 
#       mar=c(20,0,20,4))
#   prs.result <- prs.result[order(-log10(prs.result$P)),]
#   image(1, -log10(prs.result$P), t(seq_along(-log10(prs.result$P))), col=prs.result$color, axes=F,ann=F)
#   axis(4,las=2,xaxs='r',yaxs='r', tck=0.2, col="white")
#   # plot legend title
#   title(bquote(atop(-log[10] ~ model, italic(P) - value), ), 
#         line=2, cex=1.5, font=2, adj=0)
#   dev.off()
#   return(prs.result)
# }
library(pROC)
# Re-read worksheet 1 of the sample sheet and coerce it to a base data.frame.
sampletable <- data.frame(
  readxl::read_excel("/fh/fast/dai_j/BEACON/BEACON_GRANT/data/PLINKinputCombo_bca_07Feb2018.xls", 1)
)
# Evaluate LDpred polygenic scores by ROC/AUC.
#
# Arguments:
#   prefix  path prefix; <prefix>.pheno (FID, IID, case; no header) and the
#           comma-separated score files <prefix>.score_LDpred_*.txt are read
#   opt     1: for every value of the global `p.threshold` whose score file
#              exists, fit two logistic models of case==2 and print their AUCs
#           2: draw the ROC curve of the LDpred-inf score against true_phens
#
# Reads the globals `p.threshold` and `sampletable`; uses pROC::roc (pROC must
# be attached). Phenotype value -9 is treated as missing in both modes.
plot_ROC=function(prefix="/fh/fast/dai_j/BEACON/BEACON_GRANT/result/Beacon_BE_genotyped",opt=1)
{
  phenotype <- read.table(paste0(prefix,".pheno"), header=FALSE)
  colnames(phenotype)=c("FID","IID","case")
  if (opt==1)
  {
    for(i in p.threshold)
    {
      # Go through each p-value threshold .score_LDpred_p1.0000e-01.txt
      tmp=formatC(i, format = "e", digits =4)
      LDpredfile=paste0(prefix,".score_LDpred_p",tmp,".txt")
      if (file.exists(LDpredfile))
      {
        prs <- read.table(LDpredfile, header=TRUE,sep=",",stringsAsFactors = FALSE)
        pheno.prs <- merge(phenotype, prs, by=c("IID"))
        pheno.prs$true_phens[pheno.prs$true_phens==-9]=NA
        pheno.prs$case[pheno.prs$case==-9]=NA
        # attach score and case status to the matching sampletable rows
        idx=match(pheno.prs$IID,sampletable$localid)
        sampletable1=sampletable[idx,]
        sampletable1$prs=pheno.prs$PRS
        sampletable1$case=pheno.prs$case
        # both models use only samples with site code <= 30
        # fit1: score only
        fit1 <- glm(I(case==2)~prs,family=binomial,data=sampletable1[sampletable1$site<=30,],y=TRUE)
        #testroc<- roc(pheno.prs$true_phens, pheno.prs$PRS)
        
        # fit3: covariates plus score (reported as "auc_env"); rows with any
        # missing covariate are dropped by glm, so it may use fewer samples
        # than fit1
        fit3 <- glm(I(case==2)~age+sex+recurrent_HB_RF+bmi_recent_healthy+cig_smk_ever+nsaid_ever+prs,family=binomial,data=sampletable1[sampletable1$site<=30,],y=TRUE)
        roc1=roc(fit1$y,fit1$fitted.values)
        roc3=roc(fit3$y,fit3$fitted.values)
        print(paste0("p=",i,":auc_prs=",round(roc1$auc,3)," auc_env=",round(roc3$auc,3)))
        #plot(testroc, print.auc=TRUE,main=tmp)
      }
    }
  }
  
  if (opt==2) #LDpred-inf
  {
    LDpredfile=paste0(prefix,".score_LDpred-inf.txt")
    prs <- read.table(LDpredfile, header=TRUE,sep=",",stringsAsFactors = FALSE)
    pheno.prs <- merge(phenotype, prs, by=c("IID"))
    # Recode missing phenotypes exactly as the opt==1 branch does. Left as -9
    # they form a third response level and roc() would then compare only the
    # first two levels (-9 vs 1) instead of controls vs cases.
    pheno.prs$true_phens[pheno.prs$true_phens==-9]=NA
    testroc<- roc(pheno.prs$true_phens, pheno.prs$PRS)
    plot(testroc, print.auc=TRUE,main="inf")
  }
  
}

# Print the AUCs for the BCA_EA score files (opt defaults to 1: one line per
# threshold in p.threshold whose score file exists).
plot_ROC(prefix="/fh/fast/dai_j/BEACON/BEACON_GRANT/result/BCA_EA")