Now supports FF measures other than FFY
Lennart authored and Lennart committed Feb 22, 2019
1 parent 22b3b71 commit 363b698
Showing 5 changed files with 44 additions and 41 deletions.
66 changes: 34 additions & 32 deletions PREFACE.R
@@ -1,4 +1,4 @@
version = 'v0.0.2'
version = 'v0.1.0'

# ---
# Functions
@@ -7,7 +7,7 @@ version = 'v0.0.2'
print.help <- function(w = 'all'){
cat('\nUsage:\n')
if (w == 'all' | w == 'train'){
cat('\tRScript PREFACE.R train --config path/to/config.txt --outdir path/to/dir/ [--nfeat (int) --hidden (int) --cpus (int) --olm --noskewcorrect]\n')
cat('\tRScript PREFACE.R train --config path/to/config.txt --outdir path/to/dir/ [--nfeat (int) --hidden (int) --cpus (int) --femprop --olm --noskewcorrect]\n')
}
if (w == 'all' | w == 'predict'){
cat('\tRScript PREFACE.R predict --infile path/to/infile.bed --model path/to/model.RData [--json]\n')
@@ -155,13 +155,14 @@ train <- function(args){
args = args[args != out.dir]

## Optional
op.args <- c('--nfeat', '--hidden', '--olm', '--noskewcorrect', '--cpus')
op.args <- c('--nfeat', '--hidden', '--olm', '--femprop', '--noskewcorrect', '--cpus')

n.feat <- parse.op.arg(args, '--nfeat', 50)[[1]] ; args <- parse.op.arg(args, '--nfeat', 50)[[2]]
hidden <- parse.op.arg(args, '--hidden', 2)[[1]] ; args <- parse.op.arg(args, '--hidden', 2)[[2]]
cpus <- parse.op.arg(args, '--cpus', 1)[[1]] ; args <- parse.op.arg(args, '--cpus', 1)[[2]]
is.olm = F ; if ('--olm' %in% args) is.olm = T
skewcorrect = T ; if ('--noskewcorrect' %in% args) skewcorrect = F
train.gender = c('M') ; if ('--femprop' %in% args) train.gender = c('M', 'F')

## Others

@@ -177,8 +178,8 @@ train <- function(args){

config.file <- read.csv(file = config.file, sep = '\t', header = T,
comment.char='', colClasses = c('character', 'character', 'factor', 'numeric'))
if (length(which(config.file$gender == 'M')) < n.feat){
cat(paste0('Please provide at least ', n.feat, ' male samples.\n'))
if (length(which(config.file$gender %in% train.gender)) < n.feat){
cat(paste0('Please provide at least ', n.feat, ' labeled samples.\n'))
quit(save = 'no')
}

@@ -224,22 +225,23 @@ train <- function(args){

repeats = 10
test.percentage = 1/repeats
male.test.number = length(which(config.file$gender == 'M')) * test.percentage

test.number = length(which(config.file$gender %in% train.gender)) * test.percentage

oper <- foreach(i = 1:repeats) %dopar% {

cat(paste0('Model training | Repeat ', i,'/', repeats, ' ...\n'))

male.test.index.overall <- which(config.file$gender == 'M')[(as.integer((i-1)*male.test.number) + 1):as.integer(i*male.test.number)]
male.train.index.overall <- sort(which(config.file$gender == 'M')[!((which(config.file$gender == 'M')) %in% male.test.index.overall)])
male.train.index.subset <- sort(which(config.file$gender[-male.test.index.overall] == 'M'))
test.index.overall <- which(config.file$gender %in% train.gender)[(as.integer((i-1)*test.number) + 1):as.integer(i*test.number)]
train.index.overall <- sort(which(config.file$gender %in% train.gender)[!((which(config.file$gender %in% train.gender)) %in% test.index.overall)])
train.index.subset <- sort(which(config.file$gender[-test.index.overall] %in% train.gender))

cat(paste0('\tExecuting principal component analysis ...\n'))
pca.train <- prcomp(training.frame[-male.test.index.overall,])
X.train <- as.matrix(pca.train$x[male.train.index.subset, ])
Y.train <- as.matrix(config.file$FFY[male.train.index.overall], ncol = 1)
X.test <- as.matrix(scale(training.frame[male.test.index.overall,], pca.train$center, pca.train$scale) %*% pca.train$rotation)
Y.test <- as.matrix(config.file$FFY[male.test.index.overall], ncol = 1)
pca.train <- prcomp(training.frame[-test.index.overall,])
X.train <- as.matrix(pca.train$x[train.index.subset, ])
Y.train <- as.matrix(config.file$FF[train.index.overall], ncol = 1)
X.test <- as.matrix(scale(training.frame[test.index.overall,], pca.train$center, pca.train$scale) %*% pca.train$rotation)
Y.test <- as.matrix(config.file$FF[test.index.overall], ncol = 1)

if (is.olm){
cat(paste0('\tTraining ordinary linear model ...\n'))
@@ -257,7 +259,7 @@ train <- function(args){
prediction = as.numeric(compute(model, X.test[,1:n.feat])$net.result)
}

info <- plot.performance(prediction, Y.test, summary(pca.train), n.feat, 'PREFACE (%)', 'FFY (%)', paste0(out.dir, 'training_repeats/', 'repeat_', i,'.png'))
info <- plot.performance(prediction, Y.test, summary(pca.train), n.feat, 'PREFACE (%)', 'FF (%)', paste0(out.dir, 'training_repeats/', 'repeat_', i,'.png'))

results <- list()
results$intercept <- as.numeric(info[1])
@@ -274,7 +276,7 @@ train <- function(args){

if (skewcorrect){
p <- sample(length(predictions))[1:(length(predictions)/4)]
fit <- coef(lsfit(predictions[p], config.file$FFY[config.file$gender == 'M'][p]))
fit <- coef(lsfit(predictions[p], config.file$FF[config.file$gender %in% train.gender][p]))
the.intercept <- fit[1]
the.slope <- fit[2]
} else {
@@ -287,9 +289,9 @@ train <- function(args){
png(paste0(out.dir, 'FFX.png'), width=5, height=2.8, units='in', res=1024)
par(mar=c(2.7,2,0,0.3), mgp=c(1.5, 0.2, 0.2), mfrow=c(1,2), xpd = NA, oma=c(0,1.5,0,0))

v1 <- config.file$FFY[config.file$gender == 'M']
v1 <- config.file$FF[config.file$gender == 'M']
v2 <- X.ratios[config.file$gender == 'M']
plot(v1, v2, pch = 16, cex = 0.4, axes = F, xlab = 'FFY (%)', ylab = 'μ(ratio X)',
plot(v1, v2, pch = 16, cex = 0.4, axes = F, xlab = 'FF (%)', ylab = 'μ(ratio X)',
xlim = c(0, max(v1)))
fit <- rlm(v2 ~ v1)

@@ -316,7 +318,7 @@ train <- function(args){


v2 <- (v2 - fit[1]) / fit[2]
plot(v1, v2, pch = 16, cex = 0.4, axes = F, xlab = 'FFY (%)', ylab = 'FFX (%)',
plot(v1, v2, pch = 16, cex = 0.4, axes = F, xlab = 'FF (%)', ylab = 'FFX (%)',
xlim = c(0, max(v1)))
segments(min(v1), min(v1), max(v1), max(v1), lwd = 3, lty = 3, c = color.B)

@@ -335,8 +337,8 @@ train <- function(args){

cat(paste0('Executing final principal component analysis ...\n'))
pca.train <- prcomp(training.frame)
X.train <- as.matrix(pca.train$x[which(config.file$gender == 'M'), ])
Y.train <- as.matrix(config.file$FFY[which(config.file$gender == 'M')], ncol = 1)
X.train <- as.matrix(pca.train$x[which(config.file$gender %in% train.gender), ])
Y.train <- as.matrix(config.file$FF[which(config.file$gender %in% train.gender)], ncol = 1)

if (is.olm){
cat(paste0('Training final ordinary linear model ...\n'))
@@ -352,28 +354,28 @@ train <- function(args){
model <- train.neural(f, train.nn, hidden)
}

m.config.file <- config.file[config.file$gender == 'M', ]
train.config.file <- config.file[config.file$gender %in% train.gender, ]

info <- plot.performance(predictions, m.config.file$FFY, summary(pca.train),
n.feat, 'PREFACE (%)', 'FFY (%)', paste0(out.dir, 'overall_performance.png'))
info <- plot.performance(predictions, train.config.file$FF, summary(pca.train),
n.feat, 'PREFACE (%)', 'FF (%)', paste0(out.dir, 'overall_performance.png'))

index.10 <- which(m.config.file$FFY < 20 - predictions)
deviations.10 <- abs(predictions[index.10] - m.config.file$FFY[index.10])
index.10 <- which(train.config.file$FF < 20 - predictions)
deviations.10 <- abs(predictions[index.10] - train.config.file$FF[index.10])

deviations <- abs(predictions - m.config.file$FFY)
deviations <- abs(predictions - train.config.file$FF)
outliers.index <- which(deviations > info[4] + 3 * info[5])
outliers <- m.config.file$ID[outliers.index]
outliers <- train.config.file$ID[outliers.index]
outlier.values <- deviations[outliers.index]
outliers.values.noabs <- c(predictions - m.config.file$FFY)[outliers.index]
outliers.values.noabs <- c(predictions - train.config.file$FF)[outliers.index]

sink(paste0(out.dir, 'training_statistics.txt'))
cat('PREFACE - PREdict FetAl ComponEnt\n\n')
if (length(outliers) != 0){
cat(paste0('Below, some of the top candidates for outlier removal are listed.\n',
'If you know some of these have Y-aberrations, remove them from the config file and re-run.\n',
'Do not remove cases without being certain sex aneuploidies are present. If you do so, this will result in inaccurate performance statistics and possible overfitting towards unrelevant models.\n\n'))
'If you know some of these are low quality/have sex aberrations (when using FFY), remove them from the config file and re-run.\n',
'Avoid removing other cases, as this will result in inaccurate performance statistics and possible overfitting towards irrelevant models.\n\n'))
cat('_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n')
cat('ID\tFFY (%) - PREFACE (%)\n')
cat('ID\tFF (%) - PREFACE (%)\n')
for (i in rev(order(outlier.values))){
cat(paste0(outliers[i], '\t', outliers.values.noabs[i], '\n'))
}
17 changes: 9 additions & 8 deletions README.md
@@ -16,7 +16,7 @@ Each sample (whether it is used for training or for predicting) should be passed
- The possible values for 'chr' are 1 until 22 and X and Y (upper-case letters).
- 'ratio' corresponds to the log2-transformed ratio between the observed and expected copy number.
- Loci can be indeterminable (e.g. at repeats). Here, 'ratio' values should be expressed as 'NaN'.
- The order of rows does not matter. Yet, it is of paramount importance that, for a certain line, file X deals with the same locus as file Y. This implies, of course, that all copy number alteration files have the same number of lines.
- The order of rows does not matter. Yet, it is of paramount importance that, for a certain line, file x deals with the same locus as file y. This implies, of course, that all copy number alteration files have the same number of lines.
- PREFACE was validated using a bin size of 100 kb; lower bin sizes are expected to perform at least equally well. An illustrative snippet of such a file is shown below.
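
For illustration, a hypothetical excerpt of such a copy number alteration file could look as follows (column names other than 'chr' and 'ratio', and all values, are invented for this sketch; 'NaN' marks an indeterminable locus):

```
chr	start	end	ratio
1	0	100000	0.0132
1	100000	200000	-0.0457
1	200000	300000	NaN
X	0	100000	-0.4918
```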

### PREFACE's config.txt
@@ -25,11 +25,11 @@ For training, PREFACE requires a config file which contains every training case.

- Example: ```./examples/config.txt```
- Tab-separated file with at least four columns.
- The name of these columns (passed as a header) must be 'ID', 'filepath', 'gender' and 'FFY'.
- The name of these columns (passed as a header) must be 'ID', 'filepath', 'gender' and 'FF'.
- 'ID': a unique identifier should be given to each of the samples.
- 'filepath': the full absolute path of the copy number alteration files.
- 'gender': either 'M' (male) or 'F' (female), representing fetal gender. Twins/triplets/... can be included, yet they can only be labeled as 'M' if they are all male; if not, they should be 'F'.
- 'FFY': the fetal fraction according to the number of mapped Y-reads. One can use any formula he/she believes performs best at quantifying the actual fetal fraction. This measurement will be ignored for female feti.
- 'gender': either 'M' (male) or 'F' (female), representing fetal gender. Twins/triplets/... can be included, provided they are either all male or all female.
- 'FF': the fetal fraction. One can use any formula/method believed to perform best at quantifying the actual fetal fraction (PREFACE was benchmarked using the fetal fraction according to the number of mapped Y-reads, referred to as FFY). This measurement will be ignored for female feti during the supervised learning phase, unless the `--femprop` flag is given (see below); a sketch of this selection is shown after this list.
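
A minimal R sketch of how these config rows are selected for the supervised learning phase (the `config` and `use.femprop` names are illustrative; the `read.csv` call and the `train.gender` logic mirror the PREFACE.R hunks above):

```r
# Sketch only: which config.txt rows provide usable FF labels for training.
config <- read.csv('examples/config.txt', sep = '\t', header = TRUE, comment.char = '',
                   colClasses = c('character', 'character', 'factor', 'numeric'))

use.femprop <- FALSE                                      # TRUE when running with --femprop
train.gender <- if (use.femprop) c('M', 'F') else c('M')  # mirrors train.gender in PREFACE.R
train.rows <- which(config$gender %in% train.gender)      # rows whose FF label is used
```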

## Model training

@@ -40,9 +40,10 @@ RScript PREFACE.R train --config path/to/config.txt --outdir path/to/dir/ [optio

<br>Optional argument <br><br> | Function
:--- | :---
`--nfeat x` | Number of principal components to use during modeling (default: x=50)
`--hidden x` | Number of hidden layers used in neural network. Use with caution (default: x=2)
`--cpus x` | Use for multiprocessing, number of requested threads (default: x=1)
`--nfeat x` | Number of principal components to use during modeling. (default: x=50)
`--hidden x` | Number of hidden layers used in neural network. Use with caution. (default: x=2)
`--cpus x` | Use for multiprocessing, number of requested threads. (default: x=1)
`--femprop` | When using FFY as FF (recommended), FF labels for female feti are irrelevant and are ignored in the supervised learning phase (the default). If this is not desired, use this flag, which asserts that the given FFs for female feti are proportional to their actual FF (an example invocation follows this table).
`--olm` | The neural network might not converge, or an ordinary linear model might simply be a better option for your kind of data/sample size. In these cases, use this flag.
`--noskewcorrect` | This flag ensures that the best fit is generated for most (instead of all) of the data.
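
As an example, a training run that also uses female feti (asserting their FF labels are proportional to the actual fetal fraction) on four threads could be launched as follows (paths are placeholders):

```
RScript PREFACE.R train --config path/to/config.txt --outdir path/to/dir/ --femprop --cpus 4
```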

@@ -65,7 +66,7 @@ RScript PREFACE.R predict --infile path/to/infile.bed --model path/to/model.RDat
- A 'non-random' phase (representing PCs that explain variance caused by naturally occurring Gaussian noise).
- An optimal `--nfeat` captures the 'random' phase (as shown in the example at `./examples/overall_performance.png`). Capturing too much of the 'non-random' phase could lead to convergence problems during modeling.
- If you are not satisfied with the concordance of your model or with the position of `--nfeat`, re-run with a different number of features.
- Note that the resulting model (the one that is written to the output) will probably be a bit more accurate than what is claimed by the statistics file and figures. This is because PREFACE uses a cross-validation strategy where 10% of the male samples are excluded from training, after which these 10% serve as validation cases. This process is repeated 10 times. Therefore, the final performance measurements are based on models trained with only 90% of the male feti, yet the resulting model is trained with all provided cases.
- Note that the resulting model (the one that is written to the output) will probably be a bit more accurate than what is claimed by the statistics file and figures. This is because PREFACE uses a cross-validation strategy where 10% of the (male) samples are excluded from training, after which these 10% serve as validation cases. This process is repeated 10 times. Therefore, the final performance measurements are based on models trained with only 90% of the (male) feti, yet the resulting model is trained with all provided cases. A sketch of this splitting logic is shown after this list.
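
A simplified R sketch of that repeated hold-out scheme, following the indexing in the PREFACE.R hunks above (names are shortened and model fitting is only indicated by a comment):

```r
repeats <- 10
test.percentage <- 1 / repeats
labeled <- which(config$gender %in% train.gender)   # samples with usable FF labels
test.number <- length(labeled) * test.percentage

for (i in 1:repeats) {
  # hold out the i-th consecutive ~10% slice of the labeled samples as validation cases
  test.idx  <- labeled[(as.integer((i - 1) * test.number) + 1):as.integer(i * test.number)]
  train.idx <- setdiff(labeled, test.idx)
  # ... train on train.idx, predict and evaluate on test.idx ...
}
```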

# Required R packages

Binary file modified examples/FFX.png
2 changes: 1 addition & 1 deletion examples/config.txt
@@ -1,4 +1,4 @@
ID filepath gender FFY
ID filepath gender FF
ID_0 /path/to/ID_0.bed M 10.470662487285
ID_1 /path/to/ID_1.bed F 0.0
ID_2 /path/to/ID_2.bed M 6.5079177218136
Binary file modified examples/overall_performance.png
