Skip to content

Commit

Permalink
Merge branch 'dev' into 126-docs-describe-workflow-rationale
Browse files Browse the repository at this point in the history
  • Loading branch information
deliaBlue authored Jul 31, 2024
2 parents 077cf8b + 993ffac commit 9e3c639
Show file tree
Hide file tree
Showing 3 changed files with 161 additions and 54 deletions.
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ dependencies:
- graphviz=7.1.0
- python=3.9.16
- snakemake=7.24.0
- pulp=2.7.0
212 changes: 158 additions & 54 deletions scripts/merge_tables.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,83 +7,103 @@
#################

# Import required packages
if ( suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE ) { stop("[ERROR] Package 'optparse' required! Aborted.") }
if ( suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE ) { stop("[ERROR] Package 'dplyr' required! Aborted.") }
if (
suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE
) {
stop("[ERROR] Package 'optparse' required! Aborted.")
}
if (
suppressWarnings(suppressPackageStartupMessages(require("dplyr"))) == FALSE
) {
stop("[ERROR] Package 'dplyr' required! Aborted.")
}


#######################
### PARSE OPTIONS ###
#######################

# Get script name
script <- sub("--file=", "", basename(commandArgs(trailingOnly=FALSE)[4]))
script <- sub("--file=", "", basename(commandArgs(trailingOnly = FALSE)[4]))

# Build description message
description <- "Merge miRNAs quantification tables.\n"
author <- "Author: Paula Iborra, Biozentrum, University of Basel"
version <- "Version: 1.0.0 (JUN-2019)"
requirements <- "Requires: optparse"
msg <- paste(description, author, version, requirements, sep="\n")
author <- "Author: Zavolan Lab <[email protected]>"
maintainer <- "Maintainer: Iris Mestres <[email protected]>"
version <- "Version: 1.1.0 (FEB-2024)"
requirements <- "Requires: optparse, dplyr"
msg <- paste(description, author, maintainer, version, requirements, sep = "\n")

# Define list of arguments
option_list <- list(
make_option(
"--input_dir",
action="store",
type="character",
default=getwd(),
help="Absolute path from where input files shall be readed. Required!",
metavar="directory"
action = "store",
type = "character",
default = getwd(),
help = "Absolute path from where input files shall be readed. Required!",
metavar = "directory"
),
make_option(
"--output_file",
action="store",
type="character",
default=file.path(getwd(), "counts.tab"),
help="Table output file path. Default: $PWD/counts.tab",
metavar="directory"
action = "store",
type = "character",
default = file.path(getwd(), "counts.tab"),
help = "Table output file path. Default: $PWD/counts.tab",
metavar = "directory"
),
make_option(
c("--prefix"),
action="store_true",
type="character",
default=NULL,
help="Prefix for reading input files. Default: NULL.",
metavar="file"
action = "store_true",
type = "character",
default = NULL,
help = "Prefix for reading input files. Default: NULL.",
metavar = "file"
),
make_option(
c("-h", "--help"),
action="store_true",
default=FALSE,
help="Show this information and die."
action = "store_true",
default = FALSE,
help = "Show this information and die."
),
make_option(
c("-u", "--usage"),
action="store_true",
default=FALSE,
dest="help",
help="Show this information and die."
action = "store_true",
default = FALSE,
dest = "help",
help = "Show this information and die."
),
make_option(
c("-v", "--verbose"),
action="store_true",
default=FALSE,
help="Print log messages to STDOUT."
action = "store_true",
default = FALSE,
help = "Print log messages to STDOUT."
)
)

# Parse command-line arguments
opt_parser <- OptionParser(usage=paste("Usage:", script, "[OPTIONS] --input_dir <path/to/input/files>\n", sep=" "), option_list = option_list, add_help_option=FALSE, description=msg)
opt_parser <-
OptionParser(
usage = paste(
"Usage:",
script,
"[OPTIONS] --input_dir <path/to/input/files>\n",
sep = " "
),
option_list = option_list,
add_help_option = FALSE,
description = msg
)
opt <- parse_args(opt_parser)

# Re-assign variables
in.dir <- opt$`input_dir`
in_dir <- opt$`input_dir`
prefix <- opt$`prefix`
out.file <- opt$`output_file`
out_file <- opt$`output_file`
verb <- opt$`verbose`

# Validate required arguments
if ( is.null(in.dir) ) {
if (is.null(in_dir)) {
print_help(opt_parser)
stop("[ERROR] Required argument missing! Aborted.")
}
Expand All @@ -92,19 +112,84 @@ if ( is.null(in.dir) ) {
### FUNCTIONS ###
######################

merge_tables <- function(cwd, prefix){
dataFiles <- dir(cwd, prefix, full.names=TRUE)
#' Read and process input table
#'
#' `get_table()` uses `tryCatch()` to read the file in `tbl_pth`. If the table
#' is empty and an error is raised, a data frame is created.
#'
#' @param tbl_pth Path to the input table.
#' @param prefix String to be removed from the input file name. It must be
#' present in all the tables to be merged.
#'
#' @returns `get_table()` returns a data frame containing the miRNA species to
#' be counted in first column, named `ID`, and their counts in that file in
#' the second one. The name of the second column in the data frame is obtained
#' by removing the `prefix` from the input file name. If no `prefix` is given,
#' the whole file name is used.
#'
#' If the input file is empty, the returned data frame consist of one row
#' with a `NA` in both fields.
#'
#' @seealso [tryCatch()] which this function uses.
get_table <- function(tbl_pth, prefix) {
sample <- gsub(prefix, "", tbl_pth)
fields <- c("ID", basename(sample))

tryCatch(
expr = {
table <- read.table(tbl_pth, sep = "\t", col.names = fields)
return(table)
},
error = function(e) {
table <- data.frame(matrix(NA, ncol = 2, nrow = 1))
colnames(table) <- fields
return(table)
}
)
}

#' Merge tables with the same prefix
#'
#' `merge_tables()` takes all the files in `cwd` that start with `prefix` and
#' merges them keeping all the miRNA species present in each of the tables.
#'
#' @details The function `get_table()` is used to make sure that even if an
#' empty input file is given, the merge can still be done by creating a
#' data frame with a single row made of `MA`s. Therefore, prior to the
#' returning of the merged table, if there is a row with a `NA` in the
#' `ID` filed, it is removed.
#'
#' The function `dplyr::full_join()` method is used for the merge. This
#' implies that if a miRNA species in `ID` is missing in any of the tables
#' being joined, its value is set to `NA` in that column.
#'
#' @param cwd Path to the input tables directory.
#' @param prefix String used in all the tables to be selected for the merge. If
#' not provided, all the files in `cwd` are used.
#'
#' @returns `merge_tables()` returns a single data frame, `mat`, with all the
#' miRNA species present in the input tables in the first column, `ID`, and
#' their counts. Each input file has it own column.
#'
#' If all the input tables are empty, the output only consist of the table's
#' header, and if no files starting with `prefix` are found, nothing is
#' returned.
#'
#' @seealso [get_table()], [dplyr::full_join()] which this function uses.
merge_tables <- function(cwd, prefix) {
data_files <- dir(cwd, prefix, full.names = TRUE)
mat <- NULL
if (length(dataFiles)) {
mat <- read.table(dataFiles[1], sep='\t')
sample <- gsub(prefix, "", dataFiles[1])
colnames(mat)[2] <- basename(sample)
for (i in seq_len(length(dataFiles)-1)) {
mat <- full_join(mat, read.table(dataFiles[i+1], sep = "\t"), by='V1')
sample <- gsub(prefix, "", dataFiles[i+1])
colnames(mat)[i + 2] <- basename(sample)

if (length(data_files)) {
mat <- get_table(data_files[1], prefix)
for (i in seq_len(length(data_files) - 1)) {
mat <- full_join(
mat,
get_table(data_files[i + 1], prefix),
by = "ID"
)
}
colnames(mat)[1] <- "ID"
mat <- filter(mat, !is.na("ID"))
}
return(mat)
}
Expand All @@ -113,22 +198,41 @@ merge_tables <- function(cwd, prefix){
### MAIN ###
######################
# Write log
if ( verb ) cat("Creating output directory...\n", sep="")
if (verb) {
cat("Creating output directory...\n", sep = "")
}

# Create output directories
dir.create(dirname(out.file), recursive=TRUE, showWarnings=FALSE)
dir.create(
dirname(out_file),
recursive = TRUE,
showWarnings = FALSE
)

# Write log
if ( verb ) cat("Creating table...\n", sep="")
if (verb) {
cat("Creating table...\n", sep = "")
}

# Create table from input directory files
myTable <- merge_tables(cwd=in.dir, prefix=prefix)
my_table <- merge_tables(cwd = in_dir, prefix = prefix)

# Write log
if ( verb ) cat(paste("Writing table: ", out.file, "\n", sep=""), sep="")
if (verb) {
cat(paste("Writing table: ", out_file, "\n", sep = ""), sep = "")
}

# Writing table
write.table(myTable, out.file, row.names=FALSE, col.names=TRUE, quote=FALSE, sep="\t")
write.table(
my_table,
out_file,
row.names = FALSE,
col.names = TRUE,
quote = FALSE,
sep = "\t"
)

# Write log
if ( verb ) cat("Done.\n", sep="")
if (verb) {
cat("Done.\n", sep = "")
}
2 changes: 2 additions & 0 deletions workflow/rules/pileup.smk
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ rule compress_reference_genome:
LOCAL_LOG / "compress_reference_genome.log",
container:
"docker://quay.io/biocontainers/samtools:1.16.1--h00cdaf9_2"
conda:
ENV_DIR / "samtools.yaml"
shell:
"(bgzip < {input.genome} > {output.genome}) &> {log}"

Expand Down

0 comments on commit 9e3c639

Please sign in to comment.