Skip to content

Commit

Permalink
feat: add pileup script
Browse files Browse the repository at this point in the history
  • Loading branch information
deliaBlue committed Jan 24, 2024
1 parent 01272af commit c852b5f
Showing 1 changed file with 355 additions and 0 deletions.
355 changes: 355 additions & 0 deletions scripts/ascii_alignment_pileup.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,355 @@
#!/usr/bin/env Rscript

#==================#
# HEADER START #
#==================#
### Created: Sep 29, 2019
### Author: Alexander Kanitz
### Affiliation: Zavolan Group, Biozentrum, University of Basel
### Email: [email protected]
#==================#
# HEADER END #
#==================#


#==========================#
# PRE-REQUISITES START #
#==========================#
#---> LOAD OPTION PARSER <---#
if (suppressWarnings(suppressPackageStartupMessages(require("optparse"))) == FALSE) { stop("Package 'optparse' required!\nExecution aborted.") }

#---> GET SCRIPT NAME <---#
args.all <- commandArgs(trailingOnly = FALSE)
name.field <- "--file="
script <- basename(sub(name.field, "", args.all[grep(name.field, args.all)]))

#---> DESCRIPTION <---#
description <- "Generates an ASCII-style pileup of read alignments in one or more BAM files
against one or more regions specified in a BED file.\n"
author <- "Author: Alexander Kanitz"
affiliation <- "Affiliation: Biozentrum, University of Basel"
email <- "Email: [email protected]"
version <- "1.2.1"
version_formatted <- paste("Version:", version, sep=" ")
requirements <- c("optparse", "rtracklayer", "GenomicAlignments", "tools")
requirements_txt <- paste("Requires:", paste(requirements, collapse=", "), sep=" ")
msg <- paste(description, author, affiliation, email, version_formatted, requirements_txt, sep="\n")
notes <- "Notes:
- For the input queries, consider the `--maximum-region-width` parameter, which
is provided for safety. While it is possible to increase it, wide regions may
require excessive memory.
"

#---> COMMAND-LINE ARGUMENTS <---#
## List of allowed/recognized arguments
option_list <- list(
make_option(
"--reference",
action="store",
type="character",
default=NULL,
help="Reference genome sequence in FASTA format. The file *MUST* be compressed with
BGZIP. If supplied, the reference sequence for the query region(s) will be added to
the output. Note that on the first run with a specific reference genome file, an FAI
index is generated which will take some time.",
metavar="file"
),
make_option(
"--annotations",
action="store",
type="character",
default=NULL,
help="Annotation file in GFF/GTF format used to annotate sequences. If supplied,
features overlapping the query region(s) will be visualized in the output. Ensure that
the argument to option `annotation-name-field` corresponds to a field in the
annotations, otherwise the script will fail.",
metavar="file"
),
make_option(
"--output-directory",
action="store",
type="character",
default=getwd(),
help="Output directory. One output file will be created for each region in `--bed` and
the filenames will be generated from the basenames of the supplied BAM file(s) and the
name field (4th column) of the BED file. [default \"%default\"]",
metavar="dir"
),
make_option(
"--maximum-region-width",
action="store",
type="integer",
default=200,
help="Maximum input region width. Use with care as wide regions will use excessive
resources. [default %default]",
metavar="int"
),
make_option(
"--do-not-collapse-alignments",
action="store_true",
type="logical",
default=FALSE,
help="Show alignments of reads with identical sequences individually."
),
make_option(
"--minimum-count",
action="store",
type="integer",
default=1,
help="Alignments of reads with less copies than the specified number will not be
printed. Option is not considered if `do-not-collapse-alignments` is set.
[default %default]",
metavar="int"
),
make_option(
"--annotation-name-field",
action="store",
type="character",
default="Name",
help="Annotation field used to populate the `name` column in the output.
[default \"%default\"]",
metavar="str"
),
make_option(
"--padding-character",
action="store",
default=".",
help="Character used for padding alignments. [default \"%default\"]",
metavar="char"
),
make_option(
"--indel-character",
action="store",
default="-",
help="Character to denote insertions and deletions in alignments. [default \"%default\"]",
metavar="char"
),
make_option(
"--prefix",
action="store",
type="character",
default=NULL,
help="Prefix to be used in the output file name(s). If not provided
the input BAM file(s) name will be used instead,",
metavar="string"
),
make_option(
c("-h", "--help"),
action="store_true",
default=FALSE,
help="Show this information and die."
),
make_option(
c("-v", "--verbose"),
action="store_true",
default=FALSE,
help="Print log messages to STDOUT."
)
)

## Parse options
opt_parser <- OptionParser(
usage=paste(script, "[--help] [--verbose] [OPTIONS] BED BAM [BAM2 ...]\n"),
option_list=option_list,
add_help_option=FALSE,
description=msg,
epilogue=notes
)
cli <- parse_args(opt_parser, positional_arguments=c(2, Inf))

# Re-assign CLI arguments
fl.query <- cli$args[[1]]
fl.bam <- cli$args[2:length(cli$args)]
fl.ref <- cli$options[["reference"]]
fl.anno <- cli$options[["annotations"]]
dir.out <- cli$options[["output-directory"]]
prefix.out <- cli$options[["prefix"]]
width.max <- cli$options[["maximum-region-width"]]
collapse <- ! cli$options[["do-not-collapse-alignments"]]
count.min <- cli$options[["minimum-count"]]
char.pad <- cli$options[["padding-character"]]
char.indel <- cli$options[["indel-character"]]
field.name.anno <- cli$options[["annotation-name-field"]]
verb <- cli$options[["verbose"]]
#==========================#
# PRE-REQUISITES END #
#==========================#


#================#
# MAIN START #
#================#
#---> START MESSAGE <---#
if (verb) cat("Starting '", script, "'...\n", sep="")

#---> LOAD REQUIRED LIBRARIES <---#
if (verb) cat("Loading libraries...\n", sep="")
for (req in requirements) {
if ( suppressWarnings(suppressPackageStartupMessages(require(req, character.only=TRUE))) == FALSE ) { stop("Package '", req, "' required!") }
}

#---> IMPORT FILES <---#
# Print status message
if (verb) cat("Importing input files...\n", sep="")
# Load files
bed <- import(con=fl.query)
if (! is.null(fl.ref)) {ref <- FaFile(fl.ref)}
if (! is.null(fl.anno)) {anno <- import(con=fl.anno)}

# Get file prefix from BAM files or from CLI argument
fl.prefix <- if (! is.null(prefix.out)) prefix.out else paste(basename(file_path_sans_ext(fl.bam)), collapse=".")

#---> <---#
# Print status message
if (verb) cat("Iterating over regions in BED file...\n")
# Iterate over input regions
for(index in seq_along(bed)) {

#---> PREPARE STACKING OF ALIGNMENTS <---#
# Assign current region
region <- bed[index]
# Print status message
if (verb) cat("Processing region '", mcols(region)[["name"]], "'...\n", sep="")
# Exit with error if region is too wide
if (width(region) > width.max) {stop("Supplied region too large. Consider increasing the `width.max` parameter, but note that for very large regions, the memory footprint may be excessive.")}
# Initialize DNAStringSet container object
seq.out <- DNAStringSet()

#---> ADD READ ALIGNMENTS <---#
# Print status message
if (verb) cat("Iterating over BAM files...\n")
# Iterate over BAM files
for (bam in fl.bam) {
# Print status message
if (verb) cat("Adding read alignments for file '", bam,"'...\n", sep="")
# Get alignments for current BAM file
seq.stack <- stackStringsFromBam(
bam,
param=region,
D.letter=char.indel,
N.letter=char.indel,
Lpadding.letter=char.pad,
Rpadding.letter=char.pad
)
# Add to container
seq.out <- append(seq.out, seq.stack)
}
# Get complement if on minus strand
if (as.character(strand(region))[[1]] == "-") {
seq.out <- complement(seq.out)
}
# Convert to character
seq.out <- as.character(seq.out)
# Convert to dataframe for writing output
df <- data.frame(seq=seq.out, count=rep(NA, length(seq.out)), stringsAsFactors=FALSE)

#---> COLLAPSE READ ALIGNMENTS <---#
if (collapse) {
# Print status message
if (verb) cat("Collapsing identical reads/alignments...\n")
# Get unique alignments
df <- data.frame(table(df[["seq"]]), stringsAsFactors=FALSE)
if (! ncol(df) == 2) df <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(df) <- c("seq", "count")
df[["seq"]] <- as.character(df[["seq"]])
# Filter out any alignments that do not make the specified minimum count cutoff
df <- df[df[["count"]] >= count.min, ]
}

#---> SORTING ALIGNMENTS <---#
# Print status message
if (verb) cat("Sorting alignments...\n")
# Reverse sequence if on minus strand
if (as.character(strand(region))[[1]] == "-") {
df[["seq"]] <- reverse(df[["seq"]])
}
# Sort by position of first nucleotide, count and position of last nucleotide
if (nrow(df)) {
last_char <- nchar(df[["seq"]][[1]])
pos.nuc.first <- regexpr(paste0("[^", char.pad, "\\.]"), df[["seq"]])
pos.nuc.last <- last_char - regexpr(paste0("[^", char.pad, "\\.]"), unlist(lapply(df[["seq"]], reverse))) + 1
df <- df[order(
pos.nuc.first, df[["count"]], pos.nuc.last,
decreasing=c(FALSE, TRUE, FALSE)
), ]
}
# Reverse sequence again if on minus strand
if (as.character(strand(region))[[1]] == "-") {
df[["seq"]] <- reverse(df[["seq"]])
}

#---> ADD REFERENCE SEQUENCE <---#
if (! is.null(fl.ref)) {
# Print status message
if (verb) cat("Adding reference sequence...\n")
# Generate name for reference sequence
name.ref <- paste(
seqnames(region),
paste(start(region), end(region), sep="-"),
strand(region), sep=":"
)
# Get sequence
row.ref <- getSeq(ref, region) # will create .fai index if not present
# Get reverse if on minus strand
if (as.character(strand(region))[[1]] == "-") {
row.ref <- reverse(row.ref)
}
# Compile row and add to dataframe
df.ref <- data.frame(seq=as.character(row.ref), count=name.ref, stringsAsFactors=FALSE)
df <- rbind(df.ref, df)
}

#---> ADD ANNOTATIONS FOR REGION <---#
# Add annotations
if (! is.null(fl.anno)) {
# Print status message
if (verb) cat("Adding overlapping features...\n")
# Find overlapping features
features <- anno[to(findOverlaps(region, anno))]
# Set order of addition from most upstream to most downstream
if (as.character(strand(region)) == "+") {
order.features <- order(start(features))
} else {
order.features <- rev(order(end(features)))
}
# Order features
features <- features[order.features]
# Iterate over features
seqs <- sapply(seq_along(features), function(index) {
feat <- features[index]
# Set markup character depending on strand
char.anno <- ifelse(strand(region) == "+", ">", "<")
# Calculate lengths of feature (in region) & left/right padding
diff.start <- start(feat) - start(region)
n.padl <- max(0, diff.start)
n.anno <- min(min(0, diff.start) + width(feat), width(region) - n.padl)
n.padr <- width(region) - n.padl - n.anno
# Generate string encompassing feature
str.padl <- paste(rep(char.pad, n.padl), collapse="")
str.anno <- paste(rep(char.anno, n.anno), collapse="")
str.padr <- paste(rep(char.pad, n.padr), collapse="")
str.final <- paste0(str.padl, str.anno, str.padr)
})
df.anno <- data.frame(seq=seqs, count=mcols(features)[[field.name.anno]], stringsAsFactors=FALSE)
df <- rbind(df.anno, df)
}

#---> WRITE OUTPUT <---#
if (collapse) {
name.file <- paste(fl.prefix, mcols(region)[["name"]], "min", as.character(count.min), "pileup", "tab", sep=".")
} else {
name.file <- paste(fl.prefix, mcols(region)[["name"]], "uncollapsed", "pileup", "tab", sep=".")
}
fl.out <- file.path(dir.out, name.file)
# Print status message
if (verb) cat("Writing output to file '", fl.out, "'...\n", sep="")
# Write tab-separated output
write.table(df, fl.out, quote=FALSE, sep="\t", col.names=FALSE, row.names=FALSE)
}

#---> END MESSAGE <---#
if (verb) cat("Done.\n\nSession info:\n")
if (verb) print(sessionInfo())
#================#
# MAIN END #
#================#

0 comments on commit c852b5f

Please sign in to comment.