Skip to content

Prediabetic plasma cohort

Shubham Gupta edited this page Sep 12, 2023 · 5 revisions

OpenSWATH

While running OpenMS (v2.6.0), extraction window was set to 600s for one-hour gradient as the libary was not setup-specific. The plasma library was modified with latest Uniprot IDs for the analysis.

OpenSwathWorkflow -in filename.mzML.gz -tr lib.pqp -tr_irt iRT.TraML -sort_swath_maps \
                  -out_osw filename.osw -out_chrom filename.chrom.mzML \
                  -threads 8 \
                  -readOptions cacheWorkingInMemory -tempDirectory /tmp \
                  -min_upper_edge_dist 1 \
                  -mz_extraction_window 67 -mz_extraction_window_unit ppm -mz_extraction_window_ms1 35 -mz_extraction_window_ms1_unit ppm \
                  -Scoring:DIAScoring:dia_extraction_window 67 -Scoring:DIAScoring:dia_extraction_unit ppm \
                  -enable_ms1 true -Scoring:Scores:use_ms1_mi true \
                  -rt_extraction_window 600 -extra_rt_extraction_window 50 \
                  -mz_correction_function quadratic_regression_delta_ppm \
                  -Scoring:TransitionGroupPicker:background_subtraction exact \
                  -Scoring:TransitionGroupPicker:PeakIntegrator:baseline_type vertical_division_min \
                  -Scoring:Scores:use_mi_score true -Scoring:TransitionGroupPicker:compute_total_mi -Scoring:Scores:use_total_mi_score

The chromatograms were converted to sqMass format using
OpenSwathMzMLFileCacher -in filename.chrom.mzML -out filename.chrom.sqMass -lossy_compression false -threads 8

PyProphet

Next, we scored the features using pyprophet (v2.1.0). Since, it is not possible to fit 949 runs in memory, we will scale-up the analysis by first subsample each run, and merge them. The merged run will be used to train the model for feature scoring.

# First subsample features from each run
for file in osw/*.osw ; do pyprophet subsample --in=$file --out=${file}s --subsample_ratio=0.002; done
pyprophet merge --template=lib.pqp --out=model.osw osw/*.osws
pyprophet score --in=model.osw --level=ms1ms2 --classifier=XGBoost --ss_main_score='var_library_rootmeansquare' --ss_initial_fdr=0.15 --ss_iteration_fdr=0.05 --pi0_lambda 0.05 0.99 0.02
for file in osw/*.osw
do 
  pyprophet score --in=$file --apply_weights=model.osw --level=ms1ms2 --classifier=XGBoost --ss_initial_fdr=0.15 --ss_iteration_fdr=0.05 --pi0_lambda 0.05 0.99 0.02
done

for file in osw/*.osw ; do pyprophet reduce --in=$file --out=${file}r; done
pyprophet merge --template=model.osw --out=model_global.osw osw/*.oswr
pyprophet peptide --context=experiment-wide --pi0_lambda 0.05 0.99 0.05 --in=model_global.osw
pyprophet peptide --context=run-specific --pi0_lambda 0.05 0.99 0.05 --in=model_global.osw
pyprophet peptide --context=global --pi0_lambda 0.05 0.99 0.05 --in=model_global.osw
pyprophet protein --context=experiment-wide --pi0_lambda 0.05 0.99 0.02 --in=model_global.osw
pyprophet protein --context=run-specific --pi0_lambda 0.05 0.99 0.02 --in=model_global.osw
pyprophet protein --context=global --pi0_lambda 0.05 0.99 0.02 --in=model_global.osw

The PyProphet thus provides two files: model.osw and model_global.osw, with feature scores and peptide+protein scores, respectively. We can export the peaks for each run here if alignment is not needed.

for file in osw/*.osw
do
  pyprophet export --in=$file --out=$file'.tsv' --format=legacy_merged --max_rs_peakgroup_qvalue 1.0 --max_global_peptide_qvalue 1.0 --max_global_protein_qvalue 0.1
done

DIAlignR

Since performing alignment for all peptides will be computationally expensive and unnecessary. We will only align peptides that we aim to use downstream, henceforth, we will create separate files with extracted chromatograms for 1% global FDR peptides. In our analysis, we identified xxx peptides which we further processed separately on ten computing nodes. These steps reduce the filesize and chromatograms can be kept at /localscratch of each node.

Fractionate peptides

# First get peptides at 1% FDR
con <- DBI::dbConnect(RSQLite::SQLite(), "model_global.osw")
query <- "SELECT PEPTIDE_ID, MODIFIED_SEQUENCE, QVALUE FROM SCORE_PEPTIDE
 INNER JOIN PEPTIDE ON PEPTIDE.ID = SCORE_PEPTIDE.PEPTIDE_ID
 WHERE SCORE_PEPTIDE.CONTEXT = 'global' AND SCORE_PEPTIDE.QVALUE<=0.01 AND PEPTIDE.DECOY=0"
peps <- DBI::dbGetQuery(con,query)
peps <- peps[order(peps$PEPTIDE_ID),]
write.table(peps, "PeptidesFDR.tsv", row.names = F, quote=F, sep = '\t')
peps <- peps$PEPTIDE_ID

Fetch chromtograms for the peptides and store them separately for each node. Repeat the code below for each run.

library(DIAlignR)
library(data.table)
fractions <- 10L
params <- paramsDIAlignR()
params[["fractionNum"]] <- fractions
oswIn="model_global.osw"
files <- list.files(path="xics", pattern="*.chrom.sqMass")
fileInfo <- getRunNames(".", FALSE, params)
fileInfo2 <- data.frame(fileInfo)
fileInfo2[["featureFile"]] <- oswIn
precursors <- getPrecursors(fileInfo2, FALSE, params[["runType"]], 'global', 1.0, params[["level"]])
x <- read.table("PeptidesFDR.tsv", sep="\t", header=TRUE)
peps <- x$PEPTIDE_ID
precursors <- precursors[peptide_id %in% peps, ]
setkeyv(precursors, c("peptide_id", "transition_group_id"))

for(fraction in 1:10){
  params[["fraction"]] <- fraction
  idx <- DIAlignR:::getPrecursorSubset(precursors, params)
  precursors2 <- precursors[idx[1]:idx[2],]
  peps2 <- unique(precursors2$peptide_id)
  nativeIDs <- getNativeIDs(oswIn, peps2, params, FALSE)
  xicFileIn <- paste0("xics/",file)
  xicFileOut<- paste0("xics2/frac",fraction,"/",file)
  reduceXICs(nativeIDs, xicFileIn, xicFileOut)
  print(fraction)
}
print(file)
print("Fractionations Done")

Step 0: Parameters

For multi-instrument analysis MST alignment is preferred. Since, each node have separate R environment, you need to copy the relevant files to each node and execute the code below.

library(DIAlignR)
library(BiocParallel)
library(data.table)
params <- paramsDIAlignR()
params[["hardConstrain"]] <- TRUE
params[["transitionIntensity"]] <- TRUE
params[["maxPeptideFdr"]] <- 0.1
params[["polyOrd"]] <- 3L
params[["fractionNum"]] <- 10L
params[["context"]] <- "experiment-wide"
scoreFile <- model_global.osw
oswMerged <- FALSE
x <- read.table("PeptidesFDR.tsv", sep="\t", header=TRUE)
peps <- x$PEPTIDE_ID
outFile <- "PlasmaRuns"

Step 1: Calculate MST

We first caluclate the tree and store features for parallel-processing across nodes. The calculated tree will be printed-out on the screen.

### EXECUTE STEP0 CODE
BiocParallel::register(BiocParallel::MulticoreParam(workers = 4, progressbar = TRUE))
mstScript1(".", outFile, params, oswMerged, applyFun = BiocParallel::bplapply)

Step 2: Align each fraction.

Copy the files on each computing node, and execute following code. Provide the fraction number you want to align.

### EXECUTE STEP0 CODE
mstNet <- "PRINTED I\n STEP 1"
fraction <- 7 # 1:fractionNum
params[["fraction"]] <- fraction
mstScript2(".", outFile, params, oswMerged, scoreFile, peps, mstNet)

Analyze DIAlignR output

Output tables can be concatnated with bash tools.

head -n 1 plasmaRuns_1_10.tsv > plasmaRuns.tsv
for file in plasmaRuns_*_10.tsv; do tail -n +2 $file; done >> plasmaRuns.tsv