-
Notifications
You must be signed in to change notification settings - Fork 0
/
pub.R
executable file
·479 lines (389 loc) · 15.1 KB
/
pub.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
# |=== pub.R ===|
# Name stands for "Pick Up Barcodes".
# The script is designed for automatic picking up sequenceing barcodes.
# pub.R uses pam algorithm (from 'cluster' package) to cluster barcodes and select ones that
# are the most dissimilar from others.
script.version <- "1.1.a"
# Last modified 2020.10.23
######################
# Parse CL arguments #
######################
usage.msg <- "pub.R\nUsage:\n Rscript pub.R <csv_file_with_barcodes> <number_of_samples>"
# Print help message
if (commandArgs(T)[1] %in% c("-h", "-help", "--help")) {
cat(paste0("pub.R; Version ", script.version, '\n'))
cat(paste0(usage.msg, '\n'))
cat("Example:\n")
cat("Pick up 28 barcodes from file `my_favorite_barcodes.csv`.\n")
cat("\n Rscript pub.R my_favorite_barcodes.csv 28\n")
quit(save = "no")
}
if (length(commandArgs(T)) != 2) {
cat(paste0(usage.msg, '\n'))
quit(save = "no", status = 1)
}
barcodes.fpath <- commandArgs(T)[1]
num.samples <- commandArgs(T)[2]
if (! file.exists(barcodes.fpath)) {
stop(paste0("File '", barcodes.fpath, "' does not exist!"))
}
if (! grepl("^[0-9]+$", num.samples)) {
stop(paste0("Invalid number of samples: '", num.samples, "'"))
} else {
num.samples <- as.numeric(num.samples)
}
cat(paste0("pub.R; Version ", script.version, '\n\n'))
if (! require(cluster)) {
cat("Package 'cluster' is not installed!\n")
cat("You can install it with 'install.packages(\"cluster\")'\n")
stop()
}
#########################################
# Define functions and data strunctures #
#########################################
# There cannot be other IUPAC characters in barcode sequences:
bases <- "AGCT"
# == Function 'check.structure' ==
# Function checks if input file is "single" or "double".
# Moreover, it checks if there is a header in the table.
# :param fpath: path to input file;
# :type fpath: character;
# Returns vector of logical values:
# c(<IS_THERE_A_HEADER_IN_FILE>, <IS_FILE_IN_DOUBLE_FORMAT>)
check.structure <- function(fpath) {
# Read the first line of file
conn <- file(fpath)
# Get rid of leading and trailing whitespaces
line <- trimws(readLines(conn, n = 1), which = "both")
close(conn)
# Count commas (1 comma - "single" format, 3 commas -- "double" format)
num.commas <- length( unlist(regmatches(line, gregexpr(',', line))) )
if (num.commas != 1 && num.commas != 3) {
stop("Unrecognized file structure. Please, see manual.")
}
# Prepare regexp patterns for format recognition
single.patt <- paste0("^.+,[", bases, "]+$")
double.patt <- paste0("^.+,[", bases, "]+,.+,[", bases, "]+$")
# Detect format
if ( grepl(single.patt, line, ignore.case = T) ) {
return( c(FALSE, FALSE) ) # no header, "single format"
} else if ( grepl(double.patt, line, ignore.case = T) ) {
return( c(FALSE, TRUE) ) # no header, "double format"
} else {
double <- num.commas == 3 # TRUE if file in "double" format
return( c(TRUE, double) ) # there is a header
}
}
# == Function 'read.barcodes' ==
# Function reads a dataframe from input file.
# :param barcodes.fpath: path to input file;
# :type fpath: character;
# :param header: logical value indicating whether there is
# a header in input file. TRUE if there is one;
# :type header: logical;
# :param double: logical value indicating whether input file
# is in "double" format. TRUE if it is in "double" format;
# :type double: logical;
# Returns read dataframe with trimmed values.
read.barcodes <- function(barcodes.fpath, header, double) {
# Define column names
if (! double) {
col.names <- c("Idx_name", "Idx_seq")
} else {
col.names <- c("Idx_name_F", "Idx_seq_F", "Idx_name_R", "Idx_seq_R")
}
# Read file and get dataframe
barcodes.df <- read.csv(barcodes.fpath,
header = header,
col.names = col.names,
colClasses = "character",
comment.char = '#') # we want to skip commented lines
# Get rid of whitespaces
for (i in 1:length(barcodes.df)) {
barcodes.df[[i]] <- trimws(barcodes.df[[i]], which = "both")
}
# Prepare regexp pattern for invalid barcodes recognition
bases.patt <- paste0("^[", bases, "]+$")
# Check if add barcodes are DNA sequences.
# And bring them to upper case just in case.
for (i in seq(2, length(barcodes.df), by = 2)) {
barcodes.df[[i]] <- toupper(barcodes.df[[i]]) # to upper case
if (! all( grepl(bases.patt, barcodes.df[[i]]) )) {
stop("Your barcodes do not look line DNA sequences.")
}
}
return(barcodes.df)
}
# == Quite obvious distance matrix ==
base.dists <- matrix(
c(
# A C G T
0.0, 1.0, 1.0, 1.0, # A
1.0, 0.0, 1.0, 1.0, # C
1.0, 1.0, 0.0, 1.0, # G
1.0, 1.0, 1.0, 0.0 # T
),
byrow = TRUE, nrow = nchar(bases), ncol = nchar(bases),
dimnames = list(unlist(strsplit(bases, "")), unlist(strsplit(bases, "")))
)
# == Function 'get.base.dist' ==
# Function returns distance between two bases passed to it.
# :param base1: the first base to compare;
# :type base1: character;
# :param base2: the second base to compare;
# :type base2: logical;
# Returns distance between bases.
get.base.dist <- function(base1, base2) {
return(base.dists[base1, base2])
}
# == Function 'seqdist' ==
# Function calculates distance between two barcode sequences.
# This distance if defined as sum of distances between pairwise bases in sequences.
# :param seq1: the first sequence to compare;
# :type seq1: character;
# :param seq2: the second sequence to compare;
# :type seq2: character;
# Returns distance between sequences.
seqdist <- function(seq1, seq2) {
# Sequences will be cropped to min length
min.len <- min(nchar(seq1), nchar(seq2))
# Crop and split sequences (because R cannot iterate over strings)
seq1 <- unlist(strsplit( substr(seq1, 1, min.len), "" ))
seq2 <- unlist(strsplit( substr(seq2, 1, min.len), "" ))
# Calculate distance
dist <- sum( mapply(get.base.dist, seq1, seq2) )
return(dist)
}
# == Function 'calc.diss.matr' ==
# Function calculates dissimilarity matrix for set of barcodes.
# :param barcodes.df: dataframe with names of barcodes and their sequences;
# :type barcodes.df: dataframe;
# Returns dissimilarity matrix.
calc.diss.matr <- function(barcodes.df) {
# Dimensions
n <- nrow(barcodes.df)
n.sq <- n * n
# Initialize with zeros
diss.matr <- matrix(rep(0, n.sq),
nrow = n, ncol = n,
# Names of barcodes will be rownames and colnames
dimnames = list(barcodes.df[[1]], barcodes.df[[1]]))
# Shameful C-style loop for calculating distances.
# Dissimilarity matrix is symmetric -- we won't recalculate already calculated values.
for (i in 2:n) {
for (j in 1:(i-1)) {
diss.matr[i, j] <- seqdist(barcodes.df$Idx_seq[i], barcodes.df$Idx_seq[j])
diss.matr[j, i] <- diss.matr[i, j] # just copy -- matrix is symmertic
}
}
return(diss.matr)
}
# |=== Functions for checking color ballance ===|
red.group <- c('C', 'A') # "red" bases
green.group <- c('G', 'T') # "green" bases
# == Function is.ballanced ==
# Function checks if set of sequences are color-ballanced,
# See this page for details: https://international.neb.com/protocols/2016/03/23/index-pooling-guidelines-neb-e6609
# :param seqs: vector of sequences to check for color ballance;
# :type seqs: vector<character>;
# Returns TRUE if set of sequences is color-ballanced, else FALSE.
is.ballanced <- function(seqs) {
# Find min length among all sequences
min.len <- min(as.vector(sapply(X = seqs, FUN = nchar)))
for (i in 1:min.len) {
# Select i-th characters of all sequences
chars <- as.vector(sapply(X = seqs, FUN = substr,
start = i, stop = i)) # arguments for substr
# Check if there are at least one nucleotide from each group
red <- any(red.group %in% chars)
green <- any(green.group %in% chars)
if (! (red && green)) {
return(FALSE) # color ballance is corrupted
}
}
return(TRUE)
}
# == Function get.forw.names ==
# Returns returns name of forward barcode (first in pair)
# given my ad hoc naming scheme: "<forw_name> -- <rev_name>"
# :param combined.name: name of "double" barcodes divided by ' -- ';
# :type combined.name: character;
# Returns returns name of forward barcode.
get.forw.names <- function(combined.name) {
return(strsplit(combined.name, " -- ")[[1]][1])
}
# == Function get.rev.names ==
# Returns returns name of reverce barcode (second in pair)
# given my ad hoc naming scheme: "<forw_name> -- <rev_name>"
# :param combined.name: name of "double" barcodes divided by ' -- ';
# :type combined.name: character;
# Returns returns name of reverce barcode.
get.rev.names <- function(combined.name) {
return(strsplit(combined.name, " -- ")[[1]][2])
}
# == Function get.seqs ==
# Function finds sequences of barcodes in 'barcodes.df'
# given their names.
# :param barcode.names: vector of barcode names;
# :type barcode.names: vector<character>;
# :param barcodes.df: dataframe with names of barcodes and their sequences;
# :type barcodes.df: dataframe;
# :param name.i: integer indicating whether it is forward or reverse barcode.
# 1 for forward (and "single"), 3 for reverse;
# :type name.i: numeric;
get.seqs <- function(barcode.names, barcodes.df, name.i) {
# Prepare vector for sequences
seqs <- rep(NA, length(barcode.names))
found.seqs <- 0 # just counter
# Find corresponding sequences
for (i in 1:nrow(barcodes.df)) {
if (barcodes.df[[name.i]][i] %in% barcode.names) {
found.seqs <- found.seqs + 1
seqs[found.seqs] <- barcodes.df[[name.i+1]][i]
}
}
return(unique(seqs))
}
# == Function check.color.ballance ==
# Function checks color ballance of picked up barcodes.
# :param barcodes.df: dataframe with names of barcodes and their sequences;
# :type barcodes.df: dataframe;
# :param medoids: vector of named of picked up barcodes;
# :type medoids: vector<character>;
# :param double: logical value. TRUE if input file is "double". Else TRUE;
# :type double: logical;
check.color.ballance <- function(barcodes.df, medoids, double) {
# Naive assumption :-)
ballance <- TRUE
if (! double) {
# It is all simple in "single" mode:
seqs <- get.seqs(medoids, barcodes.df, 1) # find sequences
ballance <- is.ballanced(seqs) # validate them for color ballance
} else {
# Ad hoc trick: name.i is 1 for "single" and 3 for "double" mode.
# We'll just skip the 2-nd element in this vector in
# order not to create new variable and use name.i for indexing.
get.names.funcs <- c(get.forw.names, NA, get.rev.names)
for (name.i in c(1, 3)) {
# Parse names of barcodes
barcode.names <- as.vector(sapply(X = medoids,
FUN = get.names.funcs[[name.i]]))
# Fin sequences
seqs <- get.seqs(barcode.names, barcodes.df, name.i)
# Validate them for color ballance
ballance <- ballance && is.ballanced(seqs)
}
}
return(ballance)
}
###########
# Proceed #
###########
# Check file struncture
header.double <- check.structure(barcodes.fpath)
# Get rid of vector -- we need just variables
header <- header.double[1]
double <- header.double[2]
rm(header.double) # remove!
# Print some message
if (double) {
message("Input file is in \"double\" format.")
} else {
message("Input file is in \"single\" format.")
}
# Read barcode dataframe
barcodes.df <- read.barcodes(barcodes.fpath, header, double)
# Check if there is enough barcodes in file
if (nrow(barcodes.df) < num.samples) {
cat("\n")
err.msg <- "Specified number of samples is greater than number of barcodes in file!\n"
err.msg <- paste(err.msg, sprintf("Number of samples: %d;\n", num.samples),
sprintf("Number of barcodes: %d;\n", nrow(barcodes.df)))
stop(err.msg)
} else if (nrow(barcodes.df) == num.samples) {
err.msg <- "Specified number of samples equals to number of barcodes in file!\n"
err.msg <- paste(err.msg, "Why not just to use all barcodes from this file?")
stop(err.msg)
}
# Calculate dissimilarity matrix
# 'barcodes.df[,1:2]' is passed to 'calc.diss.matr'
# in order to have same syntax with "double"-formatted work mode.
message("Calculating distances...")
diss.matr <- calc.diss.matr(barcodes.df[,1:2])
# If input file is in "double" format -- calculate dissimilarity matrix
# for reverse barcodes. Final matrix will be calculated as Euclidean distance
# between the first and the second matrices.
if (double) {
reverse.matr <- calc.diss.matr(barcodes.df[,3:4])
diss.matr <- sqrt(diss.matr ^ 2 + reverse.matr ^ 2) # calculate final matrix
rm(reverse.matr) # remove!
# Rewrite rownames and colnames for double mode
rownames(diss.matr) <- paste(barcodes.df$Idx_name_F,
barcodes.df$Idx_name_R, sep = ' -- ')
colnames(diss.matr) <- paste(barcodes.df$Idx_name_F,
barcodes.df$Idx_name_R, sep = ' -- ')
}
ballance <-FALSE
backup.flag <- TRUE
clustering.msg <- "Clustering..."
while (! ballance) {
# Cluster barcodes
message(clustering.msg)
pam.clusters <- pam(diss.matr, num.samples, diss = T)
# Get best barcodes
medoids <- pam.clusters$medoids
message("Checking color ballance...")
ballance <- check.color.ballance(barcodes.df, medoids, double)
if (ballance) {
message("OK")
} else {
clustering.msg <- "Relustering..."
message("Color ballance is corrupted.")
message("Attepmting to reach color ballance...")
# Backup
if (backup.flag) {
init.medoids <- medoids
backup.flag <- FALSE
}
# Convert this matrix to data frame -- it will be more convinient
silh.df <- as.data.frame(pam.clusters$silinfo$widths)
# Add a column with names
silh.df$name <- rownames(pam.clusters$silinfo$widths)
# If no more barcodes left -- inform a user, break and use backuped barcodes.
if (nrow(silh.df) < num.samples) {
message("Color ballance cannot be reached.")
medoids <- init.medoids
break
}
# Find medoid with the smallest silhouette score.
med.silh.scores <- subset(silh.df, silh.df$name %in% medoids)
min.silh.idx <- which.min(med.silh.scores[,3])
old.barcode <- med.silh.scores$name[min.silh.idx]
# Remove medoid with min silhouette score from dissimilarity matrix
# and from dataframe
diss.matr <- diss.matr[rownames(diss.matr) != old.barcode,
colnames(diss.matr) != old.barcode]
silh.df <- subset(silh.df, silh.df$name != old.barcode)
cat("\n")
}
}
# Print the result
if (! double) {
cat("\nBest indices:\n")
for (barcode.name in medoids) {
cat(sprintf("%s\t%s\n",
barcode.name,
barcodes.df$Idx_seq[barcodes.df$Idx_name == barcode.name]))
}
} else {
cat("\nBest pairs of indices:\n")
for (barcode.name in medoids) {
forw.name <- get.forw.names(barcode.name)
forw.seq <- unique(barcodes.df$Idx_seq_F[barcodes.df$Idx_name_F == forw.name])
rev.name <- get.rev.names(barcode.name)
rev.seq <- unique(barcodes.df$Idx_seq_R[barcodes.df$Idx_name_R == rev.name])
cat(sprintf("%s\t%s\t%s\t%s\n",
forw.name,forw.seq,
rev.name, rev.seq))
}
}