README.rmd

---
title: "L. major UTR Analysis"
output:
  knitrBootstrap::bootstrap_document:
    theme: flatly
    clean_supporting: false
---

```{r knitr_settings, include=FALSE}
# Render every figure at 1280x720 pixels; knitr takes the dimensions in
# inches, hence the division by the 96 dpi resolution.
library(knitr)
opts_chunk$set(dpi=96, fig.width=1280/96, fig.height=720/96)
```

L. major UTR Length and Alternative Trans-splicing / Poly-adenylation Analysis
==============================================================================

Overview
--------

The goal of this analysis is to parse the output from our [UTR analysis
pipeline](https://github.com/khughitt/utr_analysis), perform some basic
smoothing of the spliced leader and polya acceptor site counts and determine
the most likely primary UTR boundaries for each gene where information is
available. 5'- and 3'-UTR boundaries, along with some other useful information
such as UTR GC- and CT-richness will be outputted in a format that is
convenient for downstream analysis.

Finally, we will look for evidence of either alternative trans-splicing or
poly-adenylation and attempt to visualize the prevalence and magnitude of
these events across the different developmental stages.

TODO
----

- Include Trey's uORFs
- Remove RNAs (SLRNA, etc.)

```{r run_date, results='asis', echo=FALSE}
# Emit the author contact link followed by the date of this run (raw HTML)
last_update = format(Sys.time(), "(<time>%Y-%m-%d</time>)")
email = "<a href='mailto:khughitt@umd.edu'>Keith Hughitt</a>"
cat(email, last_update)

# Start the analysis from an empty workspace
rm(list=ls())
```

Settings
--------

```{r settings}
# Number of individual UTR plots to display for each developmental stage and
# UTR side. The purpose of these plots is to provide a sense of the data
# distribution and to visualize the effect of primary site selection with and
# without smoothing.
max_plots = 10
```

[view source](README.rmd)

Methods
-------

### Load data

```{r load_annotations, warning=FALSE}
library(GenomicRanges)
library(Biostrings)
library(rtracklayer)
library(ggplot2)
library(ggvis)
library(reshape2)
library(dplyr)

# Genome sequence and annotations from TriTrypDB (8.0)
# Note: components passed to file.path() must not begin with a separator,
# otherwise the resulting path contains a doubled slash.
gff = import.gff(file.path('/cbcb/lab/nelsayed/ref_data/lmajor_friedlin',
                           'annotation/TriTrypDB-8.0_LmajorFriedlin.gff'),
                 version='3')

chromosomes = gff[gff$type == 'chromosome']
genes       = gff[gff$type == 'gene']

# Load unannotated ORFs detected from ribosome profiling data
uorfs = import.gff(gzfile('input/lmajor_orfs_manual_2014-09-08_clean.gff.gz'), version='3')
uorfs$description = 'Unannotated ORF'

# Drop GFF columns not shared between TriTrypDB GFF and uORF GFF
keep_cols = intersect(colnames(mcols(genes)), colnames(mcols(uorfs)))
genes = genes[,keep_cols]
uorfs = uorfs[,keep_cols]

genes = append(genes, uorfs)

# The genome FASTA is located relative to the REF environment variable; fail
# with a clear message instead of silently building a path rooted at "/"
ref_dir = Sys.getenv("REF")
if (!nzchar(ref_dir)) {
    stop("The REF environment variable must point to the reference data directory",
         call.=FALSE)
}
input_fasta = file.path(ref_dir,
                        "lmajor_friedlin/genome/TriTrypDB-8.0_LmajorFriedlin_Genome.fasta")
fasta = readDNAStringSet(input_fasta)

# Fix names (L. major chromosome identifiers): keep the first 7 characters
names(fasta) = substring(names(fasta), 1, 7)

# Number of bases flanking CDS to scan for motifs when actual UTR length is not
# known; numbers are based on median 5' and 3' UTR lengths for L. major
default_5utr_width = 250
default_3utr_width = 575

# Taken from L. major UTR analysis output from 2014/11/14
procyclic_polya  = import.gff(gzfile('input/lmajor_procyclic_ncrnas_removed_polya.gff.gz'), version='3')
metacyclic_polya = import.gff(gzfile('input/lmajor_metacyclic_ncrnas_removed_polya.gff.gz'), version='3')
amastigote_polya = import.gff(gzfile('input/lmajor_amastigote_ncrnas_removed_polya.gff.gz'), version='3')
procyclic_sl  = import.gff(gzfile('input/lmajor_procyclic_ncrnas_removed_sl.gff.gz'), version='3')
metacyclic_sl = import.gff(gzfile('input/lmajor_metacyclic_ncrnas_removed_sl.gff.gz'), version='3')
amastigote_sl = import.gff(gzfile('input/lmajor_amastigote_ncrnas_removed_sl.gff.gz'), version='3')

# Filter noncoding RNAs: any gene whose ID matches one of these patterns is
# removed from the gene list and from every acceptor site table
id_filter_string = 'rRNA|snRNA|snoRNA|SLRNA|TRNA|SRP'
noncoding_ids = genes$ID[grepl(id_filter_string, genes$ID)]

genes            = genes[!genes$ID %in% noncoding_ids]
procyclic_polya  = procyclic_polya[!procyclic_polya$Name %in% noncoding_ids,]
metacyclic_polya = metacyclic_polya[!metacyclic_polya$Name %in% noncoding_ids,]
amastigote_polya = amastigote_polya[!amastigote_polya$Name %in% noncoding_ids,]
procyclic_sl     = procyclic_sl[!procyclic_sl$Name %in% noncoding_ids,]
metacyclic_sl    = metacyclic_sl[!metacyclic_sl$Name %in% noncoding_ids,]
amastigote_sl    = amastigote_sl[!amastigote_sl$Name %in% noncoding_ids,]

# Create output and build directories if needed
for (x in c('build', 'output')) {
    if (!file.exists(x)) {
        dir.create(x)
    }
}
```

```{r helper_functions}
#
# find_peak
#
# Selects the primary (or secondary) acceptor site for a gene.
#
# The read counts of the acceptor sites are placed on a vector indexed by
# distance from the CDS boundary (index 1 is the boundary coordinate itself),
# optionally smoothed with a Gaussian kernel, and the positions are then
# ranked from highest to lowest score. Only positions that are backed by
# reads in the raw data can be selected.
#
# Arguments:
#   gene            the gene being processed; start(), end() and $ID are used
#   acceptor_sites  acceptor sites detected for the gene; start() gives the
#                   site coordinate and score() the number of reads
#   gene_strand     strand of the gene ('+' or '-')
#   feature_type    'sl' or 'polya'
#   smoothed        rank positions using the kernel-smoothed scores
#   include_plot    plot the raw (and, if used, smoothed) scores
#   secondary       return the second-ranked site instead of the first
#
# Returns the index into acceptor_sites of the selected site.
#
find_peak = function(gene, acceptor_sites, gene_strand, feature_type,
                     smoothed=FALSE, include_plot=FALSE, secondary=FALSE) {
    # Determine orientation of feature relative to CDS
    if ((feature_type == 'sl'    && gene_strand == '+') ||
        (feature_type == 'polya' && gene_strand == '-')) {
        feature_side = 'left'
    } else {
        feature_side = 'right'
    }

    # Create a vector from the furthest acceptor site to the CDS boundary
    if (feature_side == 'left') {
        cds_boundary = start(gene)
        raw_scores = rep(0, cds_boundary - min(start(acceptor_sites)))
        rel_start =  cds_boundary - start(acceptor_sites) + 1
    } else {
        cds_boundary = end(gene)
        raw_scores = rep(0, max(start(acceptor_sites)) - cds_boundary)
        rel_start = start(acceptor_sites) - cds_boundary + 1
    }

    # add scores at indices where acceptor sites were detected (the
    # assignment also extends the vector by one element so that the furthest
    # site is included)
    raw_scores[rel_start] = score(acceptor_sites)
    x = seq_along(raw_scores)

    # create smoothed version of scores (optional)
    if (smoothed) {
        input_scores = ksmooth(x, raw_scores, kernel="normal", bandwidth=6,
                               n.points=length(x))$y
    } else {
        input_scores = raw_scores
    }

    # Rank positions from the highest to the lowest score. Ties are broken in
    # favour of the position furthest from the CDS, which in these relative
    # coordinates is always the largest index.
    ranked = order(input_scores, x, decreasing=TRUE)

    # If smoothing was used, the location of a smoothed peak may not have any
    # support in the raw data. Such positions are skipped, so the candidates
    # are the ranked positions with a non-zero raw score.
    supported = ranked[which(raw_scores[ranked] != 0)]

    if (length(supported) == 0) {
        stop("find_peak: no acceptor site with a non-zero score", call.=FALSE)
    }

    # The secondary site is the second-best supported position; fall back to
    # the primary one when there is no second candidate.
    if (secondary && (length(supported) >= 2)) {
        peak_pos = supported[2]
    } else {
        peak_pos = supported[1]
    }

    # plot raw and smoothed peaks
    if (include_plot) {
        if (smoothed) {
            df = melt(data.frame(x, raw=raw_scores, smoothed=input_scores),
                      id=c("x"), variable.name='type')
            print(qplot(x, value, data=df, color=type, geom='line') 
                  + ggtitle(gene$ID))
        } else {
            df = melt(data.frame(x, raw=raw_scores), id=c("x"),
                    variable.name='type')
            print(qplot(x, value, data=df, geom='line')
                  + ggtitle(gene$ID))
        }
    }

    # Map the selected position back to the acceptor site located there. The
    # lookup is done by position, not by read count, since several sites may
    # share the same count. If more than one entry sits on the position, the
    # last one is used because its score is the one stored in raw_scores.
    idx = which(rel_start == peak_pos)
    return(tail(idx, 1))
}

#
# find_primary_site
#
# Determines the primary site among a list of acceptor sites and their
# associated score (number of reads mapped).
#
# Arguments:
#   acceptor_sites  acceptor sites detected for the gene
#   feature         feature type passed on to find_peak ('sl' or 'polya')
#   gene            the gene the sites belong to
#   gene_strand     strand of the gene ('+' or '-')
#   smoothed        select the site using smoothed scores
#
# Side effects: when the raw and smoothed choices lie more than 15 nt apart
# the global counter num_diff is incremented, and plot_num is advanced until
# max_plots is reached. These globals must exist before the first call.
#
# Returns a list containing the site and score for the primary site
# (both NA when there are no acceptor sites).
#
find_primary_site = function(acceptor_sites, feature, gene, gene_strand, smoothed=FALSE) {
    if (length(acceptor_sites) == 0) {
        return(list(location=NA, num_reads=NA))
    } else if (length(acceptor_sites) == 1) {
        # if only one site found, use it
        return(list(location=start(acceptor_sites)[1],
                    num_reads=score(acceptor_sites)[1]))
    }

    # two or more sites

    # find the highest peak which has non-zero coverage in the raw data
    max_index = find_peak(gene, acceptor_sites, gene_strand, feature,
                          smoothed=smoothed)

    # choice that smoothing would lead to (identical when smoothing is
    # already in use, so the search is not repeated in that case)
    if (smoothed) {
        max_index_smoothed = max_index
    } else {
        max_index_smoothed = find_peak(gene, acceptor_sites, gene_strand,
                                       feature, smoothed=TRUE)
    }

    # Compare the two choices by genomic coordinate: the indices themselves
    # are only positions within acceptor_sites and say nothing about distance
    site_shift = abs(start(acceptor_sites)[max_index] -
                     start(acceptor_sites)[max_index_smoothed])

    # if the smoothing would result in a different primary UTR choice,
    # record it (and plot the raw and smoothed versions of the data)
    if (site_shift > 15) {
        # the counters live outside of this function, so a plain assignment
        # would only modify a local copy
        num_diff <<- num_diff + 1
        if (plot_num < max_plots) {
            plot_num <<- plot_num + 1
            message(sprintf("PLOTTING %d/%d", plot_num, max_plots))
            #tmp = find_peak(gene, acceptor_sites, gene_strand, feature,
            #                smoothed=TRUE, include_plot=TRUE)
        }
    }

    return(list(location=start(acceptor_sites)[max_index], 
                num_reads=score(acceptor_sites)[max_index]))
}

#
# find_secondary_site
#
# Determines the secondary site among a list of acceptor sites and their
# associated score (number of reads mapped).
#
# Arguments:
#   acceptor_sites  acceptor sites detected for the gene
#   feature         feature type passed on to find_peak ('sl' or 'polya')
#   gene            the gene the sites belong to
#   gene_strand     strand of the gene ('+' or '-')
#   smoothed        select the site using smoothed scores
#
# Returns a list containing the site and score for the secondary site
# (both NA when there are no acceptor sites). Note that when only a single
# site exists, that site is returned.
#
find_secondary_site = function(acceptor_sites, feature, gene, gene_strand,
                               smoothed=FALSE) {
    if (length(acceptor_sites) == 0) {
        return(list(location=NA, num_reads=NA))
    } else if (length(acceptor_sites) == 1) {
        # if only one site found, use it
        return(list(location=start(acceptor_sites)[1],
                    num_reads=score(acceptor_sites)[1]))
    }

    # two or more sites: find the second highest peak which has non-zero
    # coverage in the raw data. (The smoothed alternative used to be computed
    # here as well, but its result was never used.)
    secondary_site_index = find_peak(gene, acceptor_sites, gene_strand,
                                     feature, smoothed=smoothed,
                                     secondary=TRUE)

    return(list(location=start(acceptor_sites)[secondary_site_index], 
                num_reads=score(acceptor_sites)[secondary_site_index]))
}

#
# get_utr_sequences
#
# Extracts the sequence flanking each input gene on its 5' (utr5=TRUE) or
# 3' (utr5=FALSE) side.
#
# The flank width of a gene is its entry in the "length" column of
# utr_lengths (matched on the "name" column); default_width is used where no
# length is available. Flanks are clipped to the chromosome boundaries.
#
# Returns the sequences named by gene ID, with the positive strand genes
# first followed by the (reverse complemented) negative strand genes. Only
# genes on the "+" or "-" strand are included.
#
get_utr_sequences = function(genes, fasta, utr_lengths, default_width,
                             utr5=TRUE) {
    # flank width per gene: measured UTR length if known, default otherwise
    widths = data.frame(id=genes$ID, width=NA)
    widths$width = utr_lengths$length[match(genes$ID, utr_lengths$name)]
    unknown = is.na(widths$width)
    widths$width[unknown] = default_width

    # region upstream (5') or downstream (3') of each gene
    if (utr5) {
        utr = flank(genes, widths$width)
    } else {
        utr = flank(genes, widths$width, start=FALSE)
    }

    # split by strand
    utr_strand = as.character(strand(utr))
    pos_strand = utr[utr_strand == "+"]
    neg_strand = utr[utr_strand == "-"]

    # make sure that the boundaries fall within the chromosome
    start(pos_strand) = pmax(1, start(pos_strand))
    end(pos_strand)   = pmin(end(pos_strand), width(fasta[seqnames(pos_strand)]))

    start(neg_strand) = pmax(1, start(neg_strand))
    end(neg_strand)   = pmin(end(neg_strand), width(fasta[seqnames(neg_strand)]))

    # negative strand sequences are reported in transcript orientation
    seqs = append(fasta[pos_strand], reverseComplement(fasta[neg_strand]))
    names(seqs) = c(pos_strand$ID, neg_strand$ID)

    seqs
}

#
# get_num_reads
#
# Looks up the score (number of reads) of the entry in utr_reads which starts
# at the given site.
#
# Returns NA if site is NA, 0 if no entry starts at the site, and otherwise
# the score of the first matching entry.
#
get_num_reads = function(utr_reads, site) {
    # nothing to look up without a site
    if (is.na(site)) {
        return(NA)
    }

    at_site = start(utr_reads) == site
    site_scores = score(utr_reads[at_site])

    if (length(site_scores) == 0) 0 else site_scores[1]
}
```

### Compute UTR coordinates and features

#### 5'UTR

##### 5'UTR Length

```{r compute_5utr_lengths, message=FALSE, cache=TRUE, autodep=TRUE}
# Output columns (one element is appended per gene by the loop below)
procyclic_lengths             = c()
procyclic_num_reads           = c()
procyclic_num_reads_primary   = c()
procyclic_num_reads_secondary = c()

metacyclic_lengths             = c()
metacyclic_num_reads           = c()
metacyclic_num_reads_primary   = c()
metacyclic_num_reads_secondary = c()

amastigote_lengths             = c()
amastigote_num_reads           = c()
amastigote_num_reads_primary   = c()
amastigote_num_reads_secondary = c()

combined_lengths             = c()
combined_num_reads           = c()
combined_num_reads_primary   = c()
combined_num_reads_secondary = c()

# Vectors to keep track of stage-specific primary site coverage across
# different stages (e.g. amastigote primary site reads in procyclic samples)
# This will be useful later on when looking for evidence of alternative trans-
# splicing and polyadenylation.
#
# For example, "pro_meta_primary_sl_reads" will contain the number of
# procyclic reads mapped to the metacyclic primary site.
#
# Note 2015/01/15 -- for now, we will use the simpler "primary to secondary"
# ratios *within* each condition to give a sense of the degree of "dominance"
# for a given primary site.
#
pro_meta_primary_sl_reads = c()
pro_amast_primary_sl_reads = c()
meta_pro_primary_sl_reads = c()
meta_amast_primary_sl_reads = c()
amast_pro_primary_sl_reads = c()
amast_meta_primary_sl_reads = c()

# Start GFF output for combined set of acceptor sites
gff_lines = c("##gff-version\t3",
              "##feature-ontology\tsofa.obo",
              "##attribute-ontology\tgff3_attributes.obo")

# Add chromosome entries (seq_len() yields no iterations when there are no
# chromosomes, whereas 1:length() would iterate over 1 and 0)
for (i in seq_len(length(chromosomes))) {
    ch = chromosomes[i]
    gff_lines = append(gff_lines, paste("##sequence-region", ch$Name, 1,
                                        ch$size, sep='\t'))
}

# GFF chromosome entries
#for (i in 1:length(chromosomes)) {
#    ch = chromosomes[i]
#    # ID=LmjF.01;Name=LmjF.01;description=LmjF.01;size=268988;web_id=LmjF.01;
#    # molecule_type=dsDNA;organism_name=Leishmania
#    #  major;translation_table=1;topology=linear;localization=nuclear;
#    # Dbxref=ApiDB:LmjF.01,taxon:347515

#    descr_template = paste0(
#        "ID=%s;Name=%s;description=%s;size=%s;web_id=%s;",
#        "molecule_type=%s;organism_name=%s;translation_table=%s",
#        "topology=%s;localization=%s;Dbxref=%s")
        
#    descr = sprintf(descr_template,
#                    ch$ID, ch$Name, ch$description, ch$size, ch$web_id,
#                    ch$molecule_type, ch$organism_name, ch$translation_table,
#                    ch$topology, ch$localization, 
    
#                    paste(ch$Dbxref[[1]][1], ch$Dbxref[[1]][2], sep=','))
#    gff_lines = append(gff_lines, paste(ch$Name, "TriTrypDB", "chromosome", 1,
#                                        ch$size, ".", "+", ".", descr,
#                                        sep='\t'))
#}

# Counters consulted by find_primary_site(): num_diff is meant to count the
# cases where smoothing changes the primary site choice, and plot_num the
# number of diagnostic plots produced (limited by max_plots)
num_diff = 0
plot_num = 0

# Add gene entries
#
# For every gene: collect the SL acceptor sites of each stage, build a
# combined set with summed read counts, determine the primary and secondary
# site per stage, and record the resulting 5'UTR length and read counts. A
# GFF entry is written for the 5'UTR defined by the combined primary site.
i = 1
for (gene_id in genes$ID) {
    message(sprintf("Processing SL sites for gene %d/%d", i, length(genes)))
    gene = genes[genes$ID == gene_id]

    gene_strand = as.character(strand(gene)) 
    # get all of the SL acceptor sites as a GRanges object
    metacyclic_utr5 = metacyclic_sl[metacyclic_sl$Name == gene_id]
    procyclic_utr5  = procyclic_sl[procyclic_sl$Name == gene_id]
    amastigote_utr5  = amastigote_sl[amastigote_sl$Name == gene_id]
    # the combined set starts out as the metacyclic sites; the sites of the
    # other stages are merged in below
    combined_utr5   = metacyclic_sl[metacyclic_sl$Name == gene_id]

    # total number of reads found which contain an acceptor site
    metacyclic_num_reads = append(metacyclic_num_reads, 
                                  sum(metacyclic_utr5$score))
    procyclic_num_reads  = append(procyclic_num_reads,
                                  sum(procyclic_utr5$score))
    amastigote_num_reads  = append(amastigote_num_reads,
                                  sum(amastigote_utr5$score))
    combined_num_reads   = append(combined_num_reads,
                                  sum(procyclic_utr5$score) +
                                  sum(metacyclic_utr5$score) +
                                  sum(amastigote_utr5$score))
    
    # Combined output

    # Add procyclic reads, followed by amastigote reads
    for (stage_utr5 in list(procyclic_utr5, amastigote_utr5)) {
        for (j in seq_len(length(stage_utr5))) {
            entry = stage_utr5[j]
            if(!start(entry) %in% start(combined_utr5)) {
                # if new site, add a new entry
                combined_utr5 = c(combined_utr5, entry)
            } else {
                # otherwise add the score to the existing combined score
                score(combined_utr5[start(combined_utr5) == start(entry)]) = (
                    score(combined_utr5[start(combined_utr5) == start(entry)]) +
                    score(entry))
            }
        }
    }
    i = i + 1

    # Determine length and scores for procyclic, metacyclic, amastigote, 
    # and combined outputs
    pro_primary_site      = find_primary_site(procyclic_utr5, 'sl', gene, gene_strand)
    meta_primary_site     = find_primary_site(metacyclic_utr5, 'sl', gene, gene_strand)
    amast_primary_site    = find_primary_site(amastigote_utr5, 'sl', gene, gene_strand)
    combined_primary_site = find_primary_site(combined_utr5, 'sl', gene, gene_strand)

    # secondary site
    pro_secondary_site      = find_secondary_site(procyclic_utr5, 'sl', gene, gene_strand)
    meta_secondary_site     = find_secondary_site(metacyclic_utr5, 'sl', gene, gene_strand)
    amast_secondary_site    = find_secondary_site(amastigote_utr5, 'sl', gene, gene_strand)
    combined_secondary_site = find_secondary_site(combined_utr5, 'sl', gene, gene_strand)

    # compute 5'utr length and coordinates (NA when a stage has no sites)
    if (gene_strand == '+') {
        # procyclic + strand
        procyclic_utr5_length = start(gene) - pro_primary_site$location

        # metacyclic + strand
        metacyclic_utr5_length = start(gene) - meta_primary_site$location

        # amastigote + strand
        amastigote_utr5_length = start(gene) - amast_primary_site$location

        # combined + strand
        combined_utr5_start  = combined_primary_site$location + 1
        combined_utr5_end    = start(gene) - 1
        combined_utr5_length = start(gene) - combined_primary_site$location
    } else {
        # procyclic - strand
        procyclic_utr5_length = pro_primary_site$location - end(gene)

        # metacyclic - strand
        metacyclic_utr5_length = meta_primary_site$location - end(gene)

        # amastigote - strand
        amastigote_utr5_length = amast_primary_site$location - end(gene)
        
        # combined - strand
        combined_utr5_start  = end(gene) + 1
        combined_utr5_end    = combined_primary_site$location - 1
        combined_utr5_length = combined_primary_site$location - end(gene)
    }

    # Add primary site read count and UTR length
    procyclic_lengths  = append(procyclic_lengths,  procyclic_utr5_length)
    metacyclic_lengths = append(metacyclic_lengths, metacyclic_utr5_length)
    amastigote_lengths = append(amastigote_lengths, amastigote_utr5_length)
    combined_lengths   = append(combined_lengths,   combined_utr5_length)

    procyclic_num_reads_primary  = append(procyclic_num_reads_primary,
                                          pro_primary_site$num_reads)
    metacyclic_num_reads_primary = append(metacyclic_num_reads_primary,
                                          meta_primary_site$num_reads)
    amastigote_num_reads_primary = append(amastigote_num_reads_primary,
                                          amast_primary_site$num_reads)
    combined_num_reads_primary   = append(combined_num_reads_primary,
                                          combined_primary_site$num_reads)

    procyclic_num_reads_secondary  = append(procyclic_num_reads_secondary,
                                            pro_secondary_site$num_reads)
    metacyclic_num_reads_secondary = append(metacyclic_num_reads_secondary,
                                            meta_secondary_site$num_reads)
    amastigote_num_reads_secondary = append(amastigote_num_reads_secondary,
                                            amast_secondary_site$num_reads)
    combined_num_reads_secondary   = append(combined_num_reads_secondary,
                                            combined_secondary_site$num_reads)

    # Update counts for cross-stage primary sites
    pro_meta_primary_sl_reads = append(pro_meta_primary_sl_reads,
                                 get_num_reads(procyclic_utr5, meta_primary_site$location))
    pro_amast_primary_sl_reads = append(pro_amast_primary_sl_reads,
                                 get_num_reads(procyclic_utr5, amast_primary_site$location))
    meta_pro_primary_sl_reads = append(meta_pro_primary_sl_reads,
                                 get_num_reads(metacyclic_utr5, pro_primary_site$location))
    meta_amast_primary_sl_reads = append(meta_amast_primary_sl_reads,
                                 get_num_reads(metacyclic_utr5, amast_primary_site$location))
    amast_pro_primary_sl_reads = append(amast_pro_primary_sl_reads,
                                 get_num_reads(amastigote_utr5, pro_primary_site$location))
    amast_meta_primary_sl_reads = append(amast_meta_primary_sl_reads,
                                 get_num_reads(amastigote_utr5, meta_primary_site$location))

    # Add GFF entry. Genes without any acceptor site have no primary site, so
    # no UTR coordinates exist for them; writing an entry would put "NA" in
    # the start/end/score columns, so those genes are skipped here.
    if (!is.na(combined_primary_site$location)) {
        descr = sprintf("ID=%s_5utr;Name=%s;description=%s", gene$ID, gene$ID,
                        gene$description)

        gff_entry = paste(
            seqnames(gene),
            "El-Sayed",
            "five_prime_UTR",
            combined_utr5_start,
            combined_utr5_end,
            combined_primary_site$num_reads,
            strand(gene),
            '.',
            descr, sep='\t')

        gff_lines = append(gff_lines, gff_entry)
    }
}

# Per-stage 5'UTR summary tables. Each table has one row per gene (in the
# order of genes$ID) holding the UTR length, the total number of reads, and
# the number of reads at the primary and secondary sites.

# procyclic
procyclic_utr5_df = data.frame(name=genes$ID,
                               length=procyclic_lengths,
                               num_reads=procyclic_num_reads,
                               num_reads_primary=procyclic_num_reads_primary,
                               num_reads_secondary=procyclic_num_reads_secondary)

# metacyclic
metacyclic_utr5_df = data.frame(name=genes$ID,
                                length=metacyclic_lengths,
                                num_reads=metacyclic_num_reads,
                                num_reads_primary=metacyclic_num_reads_primary,
                                num_reads_secondary=metacyclic_num_reads_secondary)

# amastigote
amastigote_utr5_df = data.frame(name=genes$ID,
                                length=amastigote_lengths,
                                num_reads=amastigote_num_reads,
                                num_reads_primary=amastigote_num_reads_primary,
                                num_reads_secondary=amastigote_num_reads_secondary)

# combined
combined_utr5_df = data.frame(name=genes$ID,
                              length=combined_lengths,
                              num_reads=combined_num_reads,
                              num_reads_primary=combined_num_reads_primary,
                              num_reads_secondary=combined_num_reads_secondary)

# Cross-stage site usage tables: the number of reads from one stage which
# mapped to the primary sites detected for the two other stages. For example,
# "pro_other_sl_sites" lists the procyclic reads found at the metacyclic and
# amastigote primary sites.
pro_other_sl_sites = tbl_df(data.frame(gene=genes$ID,
                                       metacyclic=pro_meta_primary_sl_reads,
                                       amastigote=pro_amast_primary_sl_reads))

meta_other_sl_sites = tbl_df(data.frame(gene=genes$ID,
                                        procyclic=meta_pro_primary_sl_reads,
                                        amastigote=meta_amast_primary_sl_reads))

amast_other_sl_sites = tbl_df(data.frame(gene=genes$ID,
                                         procyclic=amast_pro_primary_sl_reads,
                                         metacyclic=amast_meta_primary_sl_reads))
```

##### 5'UTR Composition

```{r utr5_composition}
# For each stage (and the combined set): extract the 5'UTR sequences, compute
# their GC- and CT-richness from the A/C/G/T counts, and add both as columns
# to the stage's UTR table.
#
# The base counts are selected by name and with drop=FALSE so that freqs
# stays a matrix even when only a single sequence is present.

# Metacyclic
metacyclic_utr5_sequences = get_utr_sequences(genes, fasta, metacyclic_utr5_df, 
                                              default_5utr_width, utr5=TRUE)
freqs = alphabetFrequency(metacyclic_utr5_sequences)[, c('A', 'C', 'G', 'T'),
                                                     drop=FALSE]
metacyclic_utr5_features_df = data.frame(
    name=names(metacyclic_utr5_sequences),
    gc=(freqs[,'G'] + freqs[,'C']) / rowSums(freqs),
    ct=(freqs[,'C'] + freqs[,'T']) / rowSums(freqs)
)
metacyclic_utr5_df = merge(metacyclic_utr5_df, metacyclic_utr5_features_df,
                           by='name')

# Procyclic
procyclic_utr5_sequences = get_utr_sequences(genes, fasta, procyclic_utr5_df, 
                                              default_5utr_width, utr5=TRUE)
freqs = alphabetFrequency(procyclic_utr5_sequences)[, c('A', 'C', 'G', 'T'),
                                                    drop=FALSE]
procyclic_utr5_features_df = data.frame(
    name=names(procyclic_utr5_sequences),
    gc=(freqs[,'G'] + freqs[,'C']) / rowSums(freqs),
    ct=(freqs[,'C'] + freqs[,'T']) / rowSums(freqs)
)
procyclic_utr5_df = merge(procyclic_utr5_df, procyclic_utr5_features_df,
                          by='name')

# amastigote
amastigote_utr5_sequences = get_utr_sequences(genes, fasta, amastigote_utr5_df, 
                                              default_5utr_width, utr5=TRUE)
freqs = alphabetFrequency(amastigote_utr5_sequences)[, c('A', 'C', 'G', 'T'),
                                                     drop=FALSE]
amastigote_utr5_features_df = data.frame(
    name=names(amastigote_utr5_sequences),
    gc=(freqs[,'G'] + freqs[,'C']) / rowSums(freqs),
    ct=(freqs[,'C'] + freqs[,'T']) / rowSums(freqs)
)
amastigote_utr5_df = merge(amastigote_utr5_df, amastigote_utr5_features_df,
                           by='name')

# Combined
combined_utr5_sequences = get_utr_sequences(genes, fasta, combined_utr5_df, 
                                              default_5utr_width, utr5=TRUE)
freqs = alphabetFrequency(combined_utr5_sequences)[, c('A', 'C', 'G', 'T'),
                                                   drop=FALSE]
combined_utr5_features_df = data.frame(
    name=names(combined_utr5_sequences),
    gc=(freqs[,'G'] + freqs[,'C']) / rowSums(freqs),
    ct=(freqs[,'C'] + freqs[,'T']) / rowSums(freqs)
)
combined_utr5_df = merge(combined_utr5_df, combined_utr5_features_df,
                         by='name')
```

```{r utr5_output}
# Save the per-stage and combined 5'UTR tables as unquoted CSV files without
# row names
write.csv(metacyclic_utr5_df, row.names=FALSE, quote=FALSE,
          file='output/lmajor_metacyclic_5utr_lengths.csv')
write.csv(procyclic_utr5_df, row.names=FALSE, quote=FALSE,
          file='output/lmajor_procyclic_5utr_lengths.csv')
write.csv(amastigote_utr5_df, row.names=FALSE, quote=FALSE,
          file='output/lmajor_amastigote_5utr_lengths.csv')
write.csv(combined_utr5_df, row.names=FALSE, quote=FALSE,
          file='output/lmajor_combined_5utr_lengths.csv')

# Save the combined 5'UTR annotations as GFF
fp = file("output/lmajor_5utr.gff", "w")
writeLines(gff_lines, fp)
close(fp)
```

#### 3'UTR

```{r compute_3utr_lengths, message=FALSE, cache=TRUE, autodep=TRUE}
# Output columns (same names as used for the 5'UTR, reset here for the 3'UTR)
procyclic_lengths           = c()
procyclic_num_reads         = c()
procyclic_num_reads_primary = c()
procyclic_num_reads_secondary = c()

metacyclic_lengths           = c()
metacyclic_num_reads         = c()
metacyclic_num_reads_primary = c()
metacyclic_num_reads_secondary = c()

amastigote_lengths           = c()
amastigote_num_reads         = c()
amastigote_num_reads_primary = c()
amastigote_num_reads_secondary = c()

combined_lengths           = c()
combined_num_reads         = c()
combined_num_reads_primary = c()
combined_num_reads_secondary = c()

# Vectors to keep track of stage-specific primary site coverage across
# different stages (e.g. amastigote primary site reads in procyclic samples)
pro_meta_primary_polya_reads = c()
pro_amast_primary_polya_reads = c()
meta_pro_primary_polya_reads = c()
meta_amast_primary_polya_reads = c()
amast_pro_primary_polya_reads = c()
amast_meta_primary_polya_reads = c()

# Start GFF output for combined set of acceptor sites
gff_lines = c("##gff-version\t3",
             "##feature-ontology\tsofa.obo",
             "##attribute-ontology\tgff3_attributes.obo")

# Add chromosome entries (seq_len() yields no iterations when there are no
# chromosomes, whereas 1:length() would iterate over 1 and 0)
for (i in seq_len(length(chromosomes))) {
    ch = chromosomes[i]
    gff_lines = append(gff_lines, paste("##sequence-region", ch$Name, 1,
                                        ch$size, sep='\t'))
}

# GFF chromosome entries
#for (i in 1:length(chromosomes)) {
#    ch = chromosomes[i]
#    # ID=LmjF.01;Name=LmjF.01;description=LmjF.01;size=268988;web_id=LmjF.01;
#    # molecule_type=dsDNA;organism_name=Leishmania
#    #  major;translation_table=1;topology=linear;localization=nuclear;
#    # Dbxref=ApiDB:LmjF.01,taxon:347515

#    descr_template = paste0(
#        "ID=%s;Name=%s;description=%s;size=%s;web_id=%s;",
#        "molecule_type=%s;organism_name=%s;translation_table=%s",
#        "topology=%s;localization=%s;Dbxref=%s")
        
#    descr = sprintf(descr_template,
#                    ch$ID, ch$Name, ch$description, ch$size, ch$web_id,
#                    ch$molecule_type, ch$organism_name, ch$translation_table,
#                    ch$topology, ch$localization, 
    
#                    paste(ch$Dbxref[[1]][1], ch$Dbxref[[1]][2], sep=','))
#    gff_lines = append(gff_lines, paste(ch$Name, "TriTrypDB", "chromosome", 1,
#                                        ch$size, ".", "+", ".", descr,
#                                        sep='\t'))
#}

# keep track of the number of plots created
num_diff = 0
plot_num = 0

# Add gene entries
#
# For every gene: collect the poly(A) sites observed in each stage, pool them
# into a combined set, select primary / secondary sites, derive the 3'UTR
# length for each stage, and append a three_prime_UTR line (based on the
# combined primary site) to the GFF output.
i = 1
for (gene_id in genes$ID) {
    message(sprintf("Processing Poly(A) sites for gene %d/%d", i, length(genes)))
    i = i + 1

    gene = genes[genes$ID == gene_id]
    gene_strand = as.character(strand(gene))

    # get all of the Poly(A) acceptor sites as a GRanges object
    metacyclic_utr3 = metacyclic_polya[metacyclic_polya$Name == gene_id]
    procyclic_utr3  = procyclic_polya[procyclic_polya$Name == gene_id]
    amastigote_utr3 = amastigote_polya[amastigote_polya$Name == gene_id]

    # the combined set starts out as the metacyclic sites; the procyclic and
    # amastigote sites are folded into it below
    combined_utr3   = metacyclic_utr3

    # total number of reads found which contain an acceptor site
    metacyclic_total = sum(metacyclic_utr3$score)
    procyclic_total  = sum(procyclic_utr3$score)
    amastigote_total = sum(amastigote_utr3$score)

    metacyclic_num_reads = append(metacyclic_num_reads, metacyclic_total)
    procyclic_num_reads  = append(procyclic_num_reads,  procyclic_total)
    amastigote_num_reads = append(amastigote_num_reads, amastigote_total)
    combined_num_reads   = append(combined_num_reads,
                                  procyclic_total +
                                  metacyclic_total +
                                  amastigote_total)

    # Combined output: merge the procyclic and then the amastigote sites into
    # the combined set. A site whose start coordinate is already present has
    # its score added to the existing entry; otherwise it is appended as a new
    # entry. seq_len() makes the inner loop a no-op for stages without sites.
    for (stage_sites in list(procyclic_utr3, amastigote_utr3)) {
        for (j in seq_len(length(stage_sites))) {
            entry = stage_sites[j]
            same_start = start(combined_utr3) == start(entry)

            if (any(same_start)) {
                score(combined_utr3[same_start]) = (
                    score(combined_utr3[same_start]) + score(entry))
            } else {
                combined_utr3 = c(combined_utr3, entry)
            }
        }
    }

    # Determine length and scores for procyclic, metacyclic, amastigote and
    # combined outputs. The code below reads $location and $num_reads from
    # the values returned by these helpers.
    pro_primary_site      = find_primary_site(procyclic_utr3, 'polya', gene, gene_strand)
    meta_primary_site     = find_primary_site(metacyclic_utr3, 'polya', gene, gene_strand)
    amast_primary_site    = find_primary_site(amastigote_utr3, 'polya', gene, gene_strand)
    combined_primary_site = find_primary_site(combined_utr3, 'polya', gene, gene_strand)

    # secondary sites
    pro_secondary_site      = find_secondary_site(procyclic_utr3, 'polya', gene, gene_strand)
    meta_secondary_site     = find_secondary_site(metacyclic_utr3, 'polya', gene, gene_strand)
    amast_secondary_site    = find_secondary_site(amastigote_utr3, 'polya', gene, gene_strand)
    combined_secondary_site = find_secondary_site(combined_utr3, 'polya', gene, gene_strand)

    # compute 3'utr length and coordinates
    if (gene_strand == '+') {
        # + strand: the 3'UTR runs from the gene end to the primary site
        procyclic_utr3_length  = pro_primary_site$location - end(gene)
        metacyclic_utr3_length = meta_primary_site$location - end(gene)
        amastigote_utr3_length = amast_primary_site$location - end(gene)

        # combined + strand
        combined_utr3_start  = end(gene) + 1
        combined_utr3_end    = combined_primary_site$location - 1
        combined_utr3_length = combined_primary_site$location - end(gene)
    } else {
        # - strand: the 3'UTR runs from the primary site to the gene start
        procyclic_utr3_length  = start(gene) - pro_primary_site$location
        metacyclic_utr3_length = start(gene) - meta_primary_site$location
        amastigote_utr3_length = start(gene) - amast_primary_site$location

        # combined - strand
        combined_utr3_start  = combined_primary_site$location + 1
        combined_utr3_end    = start(gene) - 1
        combined_utr3_length = start(gene) - combined_primary_site$location
    }

    # Add primary site read count and UTR length
    procyclic_lengths  = append(procyclic_lengths,  procyclic_utr3_length)
    metacyclic_lengths = append(metacyclic_lengths, metacyclic_utr3_length)
    amastigote_lengths = append(amastigote_lengths, amastigote_utr3_length)
    combined_lengths   = append(combined_lengths,   combined_utr3_length)

    procyclic_num_reads_primary  = append(procyclic_num_reads_primary,
                                          pro_primary_site$num_reads)
    metacyclic_num_reads_primary = append(metacyclic_num_reads_primary,
                                          meta_primary_site$num_reads)
    amastigote_num_reads_primary = append(amastigote_num_reads_primary,
                                          amast_primary_site$num_reads)
    combined_num_reads_primary   = append(combined_num_reads_primary,
                                          combined_primary_site$num_reads)

    procyclic_num_reads_secondary  = append(procyclic_num_reads_secondary,
                                            pro_secondary_site$num_reads)
    metacyclic_num_reads_secondary = append(metacyclic_num_reads_secondary,
                                            meta_secondary_site$num_reads)
    amastigote_num_reads_secondary = append(amastigote_num_reads_secondary,
                                            amast_secondary_site$num_reads)
    combined_num_reads_secondary   = append(combined_num_reads_secondary,
                                            combined_secondary_site$num_reads)

    # Update counts for cross-stage primary sites: reads observed in stage A
    # at the location of stage B's primary site. These must be looked up in
    # the current gene's poly(A) sites (*_utr3); the spliced-leader site sets
    # (*_utr5) are left over from the 5'UTR loop and do not belong here.
    pro_meta_primary_polya_reads = append(pro_meta_primary_polya_reads,
                                 get_num_reads(procyclic_utr3, meta_primary_site$location))
    pro_amast_primary_polya_reads = append(pro_amast_primary_polya_reads,
                                 get_num_reads(procyclic_utr3, amast_primary_site$location))
    meta_pro_primary_polya_reads = append(meta_pro_primary_polya_reads,
                                 get_num_reads(metacyclic_utr3, pro_primary_site$location))
    meta_amast_primary_polya_reads = append(meta_amast_primary_polya_reads,
                                 get_num_reads(metacyclic_utr3, amast_primary_site$location))
    amast_pro_primary_polya_reads = append(amast_pro_primary_polya_reads,
                                 get_num_reads(amastigote_utr3, pro_primary_site$location))
    amast_meta_primary_polya_reads = append(amast_meta_primary_polya_reads,
                                 get_num_reads(amastigote_utr3, meta_primary_site$location))

    # Add GFF entry (tab-separated; the score column holds the number of reads
    # supporting the combined primary site)
    descr = sprintf("ID=%s_3utr;Name=%s;description=%s", gene$ID, gene$ID,
                    gene$description)

    gff_entry = paste(
        seqnames(gene),
        "El-Sayed",
        "three_prime_UTR",
        combined_utr3_start,
        combined_utr3_end,
        combined_primary_site$num_reads,
        strand(gene),
        '.',
        descr, sep='\t')

    gff_lines = append(gff_lines, gff_entry)
}

# Per-stage 3'UTR summary tables: one row per gene, holding the UTR length,
# the total number of poly(A) reads, and the reads supporting the primary and
# secondary sites.

# metacyclic
metacyclic_utr3_df = data.frame(name=genes$ID,
                                length=metacyclic_lengths,
                                num_reads=metacyclic_num_reads,
                                num_reads_primary=metacyclic_num_reads_primary,
                                num_reads_secondary=metacyclic_num_reads_secondary)

# procyclic
procyclic_utr3_df = data.frame(name=genes$ID,
                               length=procyclic_lengths,
                               num_reads=procyclic_num_reads,
                               num_reads_primary=procyclic_num_reads_primary,
                               num_reads_secondary=procyclic_num_reads_secondary)

# amastigote
amastigote_utr3_df = data.frame(name=genes$ID,
                                length=amastigote_lengths,
                                num_reads=amastigote_num_reads,
                                num_reads_primary=amastigote_num_reads_primary,
                                num_reads_secondary=amastigote_num_reads_secondary)

# combined
combined_utr3_df = data.frame(name=genes$ID,
                              length=combined_lengths,
                              num_reads=combined_num_reads,
                              num_reads_primary=combined_num_reads_primary,
                              num_reads_secondary=combined_num_reads_secondary)

# Cross-stage site usage: for each stage, the number of its reads found at the
# primary poly(A) sites chosen for the two other stages.
pro_other_polya_sites = data.frame(gene=genes$ID,
                                   metacyclic=pro_meta_primary_polya_reads,
                                   amastigote=pro_amast_primary_polya_reads) %>%
    tbl_df()

meta_other_polya_sites = data.frame(gene=genes$ID,
                                    procyclic=meta_pro_primary_polya_reads,
                                    amastigote=meta_amast_primary_polya_reads) %>%
    tbl_df()

amast_other_polya_sites = data.frame(gene=genes$ID,
                                     procyclic=amast_pro_primary_polya_reads,
                                     metacyclic=amast_meta_primary_polya_reads) %>%
    tbl_df()
```

##### 3'UTR Composition

```{r utr3_composition}
# Compute GC- and CT-richness for a set of UTR sequences.
#
# sequences: named sequence set accepted by alphabetFrequency()
# returns:   data.frame with columns name, gc and ct, one row per sequence
#
# Only the first four alphabetFrequency() columns are used; the code below
# indexes them by the names 'G', 'C' and 'T'. drop=FALSE keeps the result a
# matrix when just a single sequence is present, so the column lookups by
# name still work.
utr3_base_composition = function(sequences) {
    freqs = alphabetFrequency(sequences)[, 1:4, drop=FALSE]
    totals = rowSums(freqs)

    data.frame(
        name=names(sequences),
        gc=(freqs[,'G'] + freqs[,'C']) / totals,
        ct=(freqs[,'C'] + freqs[,'T']) / totals
    )
}

# For each stage: extract the 3'UTR sequences, compute their composition and
# attach the gc / ct columns to the stage's summary table (joined on name).

# Metacyclic
metacyclic_utr3_sequences = get_utr_sequences(genes, fasta, metacyclic_utr3_df,
                                              default_3utr_width, utr5=FALSE)
metacyclic_utr3_features_df = utr3_base_composition(metacyclic_utr3_sequences)
metacyclic_utr3_df = merge(metacyclic_utr3_df, metacyclic_utr3_features_df,
                           by='name')

# Procyclic
procyclic_utr3_sequences = get_utr_sequences(genes, fasta, procyclic_utr3_df,
                                             default_3utr_width, utr5=FALSE)
procyclic_utr3_features_df = utr3_base_composition(procyclic_utr3_sequences)
procyclic_utr3_df = merge(procyclic_utr3_df, procyclic_utr3_features_df,
                          by='name')

# Amastigote
amastigote_utr3_sequences = get_utr_sequences(genes, fasta, amastigote_utr3_df,
                                              default_3utr_width, utr5=FALSE)
amastigote_utr3_features_df = utr3_base_composition(amastigote_utr3_sequences)
amastigote_utr3_df = merge(amastigote_utr3_df, amastigote_utr3_features_df,
                           by='name')

# Combined
combined_utr3_sequences = get_utr_sequences(genes, fasta, combined_utr3_df,
                                            default_3utr_width, utr5=FALSE)
combined_utr3_features_df = utr3_base_composition(combined_utr3_sequences)
combined_utr3_df = merge(combined_utr3_df, combined_utr3_features_df,
                         by='name')
```

```{r utr3_output}
# Save the per-stage and combined 3'UTR tables as CSV (unquoted, without row
# names); tables and target paths are paired positionally.
invisible(Map(
    function(utr_table, path) {
        write.csv(utr_table, file=path, quote=FALSE, row.names=FALSE)
    },
    list(metacyclic_utr3_df,
         procyclic_utr3_df,
         amastigote_utr3_df,
         combined_utr3_df),
    c('output/lmajor_metacyclic_3utr_lengths.csv',
      'output/lmajor_procyclic_3utr_lengths.csv',
      'output/lmajor_amastigote_3utr_lengths.csv',
      'output/lmajor_combined_3utr_lengths.csv')
))

# Save the accumulated GFF lines (header pragmas plus one entry per gene)
fp = file("output/lmajor_3utr.gff")
writeLines(text=gff_lines, con=fp)
close(fp)
```

Results
-------

### Length statistics

#### 5'UTR length statistics

```{r 5utr_length_stats}
# Lengths of all 5'UTRs in the combined table that have a length assigned
lengths = combined_utr5_df$length[!is.na(combined_utr5_df$length)]

# Fraction (0-1) of genes with a 5'UTR length; scaled to a percentage when
# printed so that the value matches the "%" label
coverage = sum(!is.na(combined_utr5_df$length)) / nrow(combined_utr5_df)
print(sprintf("%% 5'UTRs detected: %0.2f", 100 * coverage))

quantile(lengths)
mean(lengths)
median(lengths)

# Most frequently observed length. base::mode() reports the storage mode of an
# object ("numeric"), not the statistical mode, so tabulate the values instead
# (ties resolve to the smallest length).
as.numeric(names(which.max(table(lengths))))
hist(lengths)
```

#### 3'UTR length statistics

```{r 3utr_length_stats}
# Lengths of all 3'UTRs in the combined table that have a length assigned
lengths = combined_utr3_df$length[!is.na(combined_utr3_df$length)]

# Fraction (0-1) of genes with a 3'UTR length; scaled to a percentage when
# printed so that the value matches the "%" label
coverage = sum(!is.na(combined_utr3_df$length)) / nrow(combined_utr3_df)
print(sprintf("%% 3'UTRs detected: %0.2f", 100 * coverage))

quantile(lengths)
mean(lengths)
median(lengths)

# Most frequently observed length. base::mode() reports the storage mode of an
# object ("numeric"), not the statistical mode, so tabulate the values instead
# (ties resolve to the smallest length).
as.numeric(names(which.max(table(lengths))))
hist(lengths)
```

### Alternative trans-splicing

#### Metacyclic vs. Procyclic

```{r alt_trans_splicing_meta_vs_pro}
# rescale to range 0-1
#
# x:             numeric vector
# trimmed:       if TRUE, outliers will be trimmed first to reduce their
#                influence on the resulting colorscale: values are clamped to
#                the [1 - trim_quantile, trim_quantile] quantile range
# trim_quantile: upper quantile used for trimming
#
# Returns a vector of the same length as x with values in [0, 1]. Missing
# values are ignored when computing the bounds and stay NA in the result. A
# vector whose (trimmed) values are all identical is mapped to 0 rather than
# NaN. Empty or all-NA input is returned unchanged.
rescale = function (x, trimmed=TRUE, trim_quantile=0.99) {
    # nothing to rescale
    if (all(is.na(x))) {
        return(x)
    }

    if (trimmed) {
        lower_bound = quantile(x, 1 - trim_quantile, na.rm=TRUE)
        upper_bound = quantile(x, trim_quantile, na.rm=TRUE)

        x = pmax(pmin(x, upper_bound), lower_bound)
    }

    value_range = range(x, na.rm=TRUE)
    span = value_range[2] - value_range[1]

    # constant input: avoid dividing by zero
    if (span == 0) {
        return(pmax(0, x - value_range[1]))
    }
    return(pmax(0, (x - value_range[1]) / span))
}

# metacyclic vs. procyclic
# For the purposes of comparing usage ratios, we will treat non-covered sites
# as having one read mapped
#
# Columns:
#   average_primary_site_num_reads / count_average:
#       mean of the two stages' primary site read counts
#   average_ptos_ratio:
#       mean of the two stages' primary-to-secondary read ratios
#   meta_pro_ratio_meta_samples:
#       metacyclic primary site reads relative to the metacyclic reads found
#       at the procyclic primary site (floored at 1)
#   pro_meta_ratio_pro_samples:
#       the same ratio from the procyclic side
meta_pro_5utr_comparison = tbl_df(data.frame(
    name=metacyclic_utr5_df$name,
    metacyclic_len=metacyclic_utr5_df$length,
    procyclic_len=procyclic_utr5_df$length,
    average_primary_site_num_reads=(metacyclic_utr5_df$num_reads_primary +
                                    procyclic_utr5_df$num_reads_primary) / 2,
    average_ptos_ratio=((metacyclic_utr5_df$num_reads_primary /
                         metacyclic_utr5_df$num_reads_secondary) + 
                        (procyclic_utr5_df$num_reads_primary /
                         procyclic_utr5_df$num_reads_secondary)) / 2,
    count_average=(metacyclic_utr5_df$num_reads_primary +
                   procyclic_utr5_df$num_reads_primary) / 2,
    meta_pro_ratio_meta_samples=(metacyclic_utr5_df$num_reads_primary /
                           pmax(meta_other_sl_sites$procyclic, 1)),
    pro_meta_ratio_pro_samples=(procyclic_utr5_df$num_reads_primary / 
                          pmax(pro_other_sl_sites$metacyclic, 1))
))

# version with reads mapped for stages of interest
# NOTE(review): log2() yields -Inf for zero inputs (e.g. identical lengths in
# both stages); the sibling stage comparisons use log1p() instead -- confirm
# which transform is intended.
meta_pro_5utr_comparison_complete = meta_pro_5utr_comparison %>%
    filter(!is.na(metacyclic_len) & !is.na(procyclic_len)) %>%
    mutate(
        log_average_ratio=log2(0.5 * (meta_pro_ratio_meta_samples + pro_meta_ratio_pro_samples)),
        length_diff=abs(metacyclic_len - procyclic_len), 
        log_average_reads=log2(average_primary_site_num_reads),
        log_average_ptos=log2(average_ptos_ratio),
        log_length_diff=log2(length_diff)
    )

# UTR length in one stage vs. the other; the grey guide lines mark a length
# difference of 300 between stages
ggplot(meta_pro_5utr_comparison_complete, 
       aes(metacyclic_len, procyclic_len, color=log_average_ptos,
           size=average_primary_site_num_reads)) + 
    geom_point() +
    scale_color_gradient2(low="black", mid="blue", high="red") +
    geom_abline(slope=1, intercept=300, color='#666666', lwd=0.5) +
    geom_abline(slope=1, intercept=-300, color='#666666', lwd=0.5) +
    scale_x_continuous(expand = c(0.01, 0.01)) +
    scale_y_continuous(expand = c(0.01, 0.01))
    #geom_segment(aes(x=300, y=0, xend=6000, yend=5700, color='#666666'))
    #scale_colour_continuous(high='red') 
    #scale_size(trans='log')

# sites with switching only
ggplot(meta_pro_5utr_comparison_complete %>% filter(metacyclic_len != procyclic_len),
       aes(metacyclic_len, procyclic_len, color=log_average_ptos,
           size=log_average_reads)) + 
    geom_point() +
    scale_color_gradient2(low="black", mid="blue", high="red") +
    geom_abline(slope=1, intercept=300, color='#666666', lwd=0.5) +
    geom_abline(slope=1, intercept=-300, color='#666666', lwd=0.5) +
    scale_x_continuous(expand = c(0.01, 0.01)) +
    scale_y_continuous(expand = c(0.01, 0.01))

# print top 25 genes with strongest site switching behavior
# Selection uses the same criteria as the other stage comparisons: a length
# difference of more than 300 together with a high usage ratio. (Filtering on
# the bare read count would only test that it is non-zero.)
meta_pro_5utr_diff = meta_pro_5utr_comparison_complete %>% 
    filter(length_diff > 300 & log_average_ratio > 0.75) %>%
    select(-log_length_diff)
#kable(head(meta_pro_5utr_diff  %>% arrange(-score), 25))

print(sprintf("Number of alternatively trans-spliced genes (meta vs. pro): %d",
              nrow(meta_pro_5utr_diff)))
```

#### Metacyclic vs. Amastigote

```{r alt_trans_splicing_meta_vs_amast}
# metacyclic vs. amastigote
#
# Per-gene comparison of 5'UTR lengths and primary site usage between the two
# stages. Cross-stage read counts are floored at one read before being used as
# a denominator.
# NOTE(review): average_primary_site_num_reads is computed as the absolute
# difference of the two primary site read counts, not their mean -- confirm
# that this is intended.
meta_amast_5utr_comparison = data.frame(
    name=metacyclic_utr5_df$name,
    metacyclic_len=metacyclic_utr5_df$length,
    amastigote_len=amastigote_utr5_df$length,
    average_primary_site_num_reads=abs(metacyclic_utr5_df$num_reads_primary -
                                       amastigote_utr5_df$num_reads_primary),
    average_ptos_ratio=0.5 * (
        (metacyclic_utr5_df$num_reads_primary /
         metacyclic_utr5_df$num_reads_secondary) +
        (amastigote_utr5_df$num_reads_primary /
         amastigote_utr5_df$num_reads_secondary)),
    meta_amast_ratio_meta_samples=(metacyclic_utr5_df$num_reads_primary /
                                   pmax(meta_other_sl_sites$amastigote, 1)),
    amast_meta_ratio_amast_samples=(amastigote_utr5_df$num_reads_primary /
                                    pmax(amast_other_sl_sites$metacyclic, 1))
) %>%
    tbl_df() %>%
    mutate(
        length_diff=abs(metacyclic_len - amastigote_len),
        log_average_reads=log1p(average_primary_site_num_reads),
        log_average_ptos=log1p(average_ptos_ratio),
        log_length_diff=log1p(length_diff)
    )

# keep only genes with a UTR length available in both stages
meta_amast_5utr_comparison_complete = meta_amast_5utr_comparison %>%
    filter(!is.na(metacyclic_len), !is.na(amastigote_len)) %>%
    mutate(log_average_ratio=log1p(
        0.5 * (meta_amast_ratio_meta_samples + amast_meta_ratio_amast_samples)))

#meta_amast_5utr_comparison_complete %>% 
#    ggvis(~metacyclic_len, ~amastigote_len, fill=~log_average_reads) %>% 
#    layer_points() %>% 
#    add_tooltip(function(df) df$name)

# length in one stage vs. the other; grey guides mark a difference of 300
meta_amast_5utr_comparison_complete %>%
    ggplot(aes(x=metacyclic_len, y=amastigote_len,
               color=log_average_ptos, size=log_average_reads)) +
    geom_point() +
    geom_abline(slope=1, intercept=300, color='#666666') +
    geom_abline(slope=1, intercept=-300, color='#666666') +
    scale_color_gradient2(low="black", mid="blue", high="red") +
    scale_x_continuous(expand=c(0.01, 0.01)) +
    scale_y_continuous(expand=c(0.01, 0.01))
    #geom_segment(aes(x=300, y=0, xend=6000, yend=5700, color='#666666'))
    #scale_colour_continuous(high='red') 

# genes with the strongest site switching behavior: large length difference
# combined with a high usage ratio
meta_amast_5utr_diff = meta_amast_5utr_comparison_complete %>%
    filter(length_diff > 300, log_average_ratio > 0.75) %>%
    select(-log_length_diff)
#kable(head(meta_amast_5utr_diff  %>% arrange(-score), 25))
print(sprintf(
    "Number of alternatively trans-spliced genes (meta vs. amast): %d",
    nrow(meta_amast_5utr_diff)
))
```

#### Procyclic vs. Amastigote

```{r alt_trans_splicing_pro_vs_amast}
# procyclic vs. amastigote
#
# Per-gene comparison of 5'UTR lengths and primary site usage between the two
# stages. Cross-stage read counts are floored at one read before being used as
# a denominator.
# NOTE(review): average_primary_site_num_reads is computed as the absolute
# difference of the two primary site read counts, not their mean -- confirm
# that this is intended.
pro_amast_5utr_comparison = data.frame(
    name=procyclic_utr5_df$name,
    procyclic_len=procyclic_utr5_df$length,
    amastigote_len=amastigote_utr5_df$length,
    average_primary_site_num_reads=abs(procyclic_utr5_df$num_reads_primary -
                                       amastigote_utr5_df$num_reads_primary),
    average_ptos_ratio=0.5 * (
        (procyclic_utr5_df$num_reads_primary /
         procyclic_utr5_df$num_reads_secondary) +
        (amastigote_utr5_df$num_reads_primary /
         amastigote_utr5_df$num_reads_secondary)),
    pro_amast_ratio_pro_samples=(procyclic_utr5_df$num_reads_primary /
                                 pmax(pro_other_sl_sites$amastigote, 1)),
    amast_pro_ratio_amast_samples=(amastigote_utr5_df$num_reads_primary /
                                   pmax(amast_other_sl_sites$procyclic, 1))
) %>%
    tbl_df() %>%
    mutate(
        length_diff=abs(procyclic_len - amastigote_len),
        log_average_reads=log1p(average_primary_site_num_reads),
        log_average_ptos=log1p(average_ptos_ratio),
        log_length_diff=log1p(length_diff)
    )

# keep only genes with a UTR length available in both stages
pro_amast_5utr_comparison_complete = pro_amast_5utr_comparison %>%
    filter(!is.na(procyclic_len), !is.na(amastigote_len)) %>%
    mutate(log_average_ratio=log1p(
        0.5 * (pro_amast_ratio_pro_samples + amast_pro_ratio_amast_samples)))

# length in one stage vs. the other; grey guides mark a difference of 300
pro_amast_5utr_comparison_complete %>%
    ggplot(aes(x=procyclic_len, y=amastigote_len,
               color=log_average_ptos, size=log_average_reads)) +
    geom_point() +
    geom_abline(slope=1, intercept=300, color='#666666') +
    geom_abline(slope=1, intercept=-300, color='#666666') +
    scale_color_gradient2(low="black", mid="blue", high="red") +
    scale_x_continuous(expand=c(0.01, 0.01)) +
    scale_y_continuous(expand=c(0.01, 0.01))
    #geom_segment(aes(x=300, y=0, xend=6000, yend=5700, color='#666666'))
    #scale_colour_continuous(high='red') 

# genes with the strongest site switching behavior: large length difference
# combined with a high usage ratio
pro_amast_5utr_diff = pro_amast_5utr_comparison_complete %>%
    filter(length_diff > 300, log_average_ratio > 0.75) %>%
    select(-log_length_diff)
#kable(head(pro_amast_5utr_diff  %>% arrange(-score), 25))
print(sprintf(
    "Number of alternatively trans-spliced genes (pro vs. amast): %d",
    nrow(pro_amast_5utr_diff)
))
```

### Alternative poly-adenylation

#### Metacyclic vs. Procyclic

```{r alt_polya_meta_vs_pro}
# metacyclic vs. procyclic
# For the purposes of comparing usage ratios, we will treat non-covered sites
# as having one read mapped
#
# Columns:
#   count_average:      mean of the two stages' primary site read counts
#   average_ptos_ratio: mean of the two stages' primary-to-secondary read
#                       ratios
#   meta_pro_ratio_meta_samples / pro_meta_ratio_pro_samples:
#                       a stage's primary site reads relative to its reads at
#                       the other stage's primary site (floored at 1)
# NOTE(review): average_primary_site_num_reads is computed as the absolute
# difference of the two primary site read counts, not their mean -- confirm
# that this is intended.
meta_pro_3utr_comparison = tbl_df(data.frame(
    name=metacyclic_utr3_df$name,
    metacyclic_len=metacyclic_utr3_df$length,
    procyclic_len=procyclic_utr3_df$length,
    count_average=(metacyclic_utr3_df$num_reads_primary +
                   procyclic_utr3_df$num_reads_primary) / 2,
    average_ptos_ratio=((metacyclic_utr3_df$num_reads_primary /
                         metacyclic_utr3_df$num_reads_secondary) + 
                        (procyclic_utr3_df$num_reads_primary /
                         procyclic_utr3_df$num_reads_secondary)) / 2,
    average_primary_site_num_reads=abs(metacyclic_utr3_df$num_reads_primary -
                   procyclic_utr3_df$num_reads_primary),
    meta_pro_ratio_meta_samples=(metacyclic_utr3_df$num_reads_primary /
                           pmax(meta_other_polya_sites$procyclic, 1)),
    pro_meta_ratio_pro_samples=(procyclic_utr3_df$num_reads_primary / 
                          pmax(pro_other_polya_sites$metacyclic, 1))
))

# version with reads mapped for stages of interest
# NOTE(review): log2() yields -Inf for zero inputs (e.g. identical lengths in
# both stages); the sibling stage comparisons use log1p() instead -- confirm
# which transform is intended.
meta_pro_3utr_comparison_complete = meta_pro_3utr_comparison %>%
    filter(!is.na(metacyclic_len) & !is.na(procyclic_len)) %>%
    mutate(
        log_average_ratio=log2(0.5 * (meta_pro_ratio_meta_samples + pro_meta_ratio_pro_samples)),
        length_diff=abs(metacyclic_len - procyclic_len), 
        log_average_reads=log2(average_primary_site_num_reads),
        log_average_ptos=log2(average_ptos_ratio),
        log_length_diff=log2(length_diff)
    )

# UTR length in one stage vs. the other; the grey guide lines mark a length
# difference of 300 between stages
ggplot(meta_pro_3utr_comparison_complete, 
       aes(metacyclic_len, procyclic_len, color=log_average_ptos,
           size=average_primary_site_num_reads)) + 
    geom_point() +
    scale_color_gradient2(low="black", mid="blue", high="red") +
    geom_abline(slope=1, intercept=300, color='#666666') +
    geom_abline(slope=1, intercept=-300, color='#666666') +
    scale_x_continuous(expand = c(0.01, 0.01)) +
    scale_y_continuous(expand = c(0.01, 0.01))
    #geom_segment(aes(x=300, y=0, xend=6000, yend=5700, color='#666666'))
    #scale_colour_continuous(high='red') 

# sites with switching only
ggplot(meta_pro_3utr_comparison_complete %>% filter(metacyclic_len != procyclic_len),
       aes(metacyclic_len, procyclic_len, color=log_average_ptos,
           size=log_average_reads)) + 
    geom_point() +
    scale_color_gradient2(low="black", mid="blue", high="red") +
    geom_abline(slope=1, intercept=300, color='#666666') +
    geom_abline(slope=1, intercept=-300, color='#666666') +
    scale_x_continuous(expand = c(0.01, 0.01)) +
    scale_y_continuous(expand = c(0.01, 0.01))

# print top 25 genes with strongest site switching behavior
meta_pro_3utr_diff = meta_pro_3utr_comparison_complete %>% 
    filter(length_diff > 300 & log_average_ratio > 0.75) %>%
    select(-log_length_diff)
#kable(head(meta_pro_3utr_diff  %>% arrange(-score), 25))

# this section reports poly-adenylation site switching, so label it as such
print(sprintf("Number of alternatively polyadenylated genes (meta vs. pro): %d",
              nrow(meta_pro_3utr_diff)))
```

#### Metacyclic vs. Amastigote

```{r alt_polya_meta_vs_amast}
# metacyclic vs. amastigote
#
# Per-gene comparison of 3'UTR lengths and primary poly(A) site usage between
# the two stages. Cross-stage read counts are floored at one read before being
# used as a denominator.
# NOTE(review): average_primary_site_num_reads is computed as the absolute
# difference of the two primary site read counts, not their mean -- confirm
# that this is intended.
meta_amast_3utr_comparison = tbl_df(data.frame(
    name=metacyclic_utr3_df$name,
    metacyclic_len=metacyclic_utr3_df$length,
    amastigote_len=amastigote_utr3_df$length,
    average_primary_site_num_reads=abs(metacyclic_utr3_df$num_reads_primary -
                   amastigote_utr3_df$num_reads_primary),
    average_ptos_ratio=((metacyclic_utr3_df$num_reads_primary /
                         metacyclic_utr3_df$num_reads_secondary) + 
                        (amastigote_utr3_df$num_reads_primary /
                         amastigote_utr3_df$num_reads_secondary)) / 2,
    meta_amast_ratio_meta_samples=(metacyclic_utr3_df$num_reads_primary /
                           pmax(meta_other_polya_sites$amastigote, 1)),
    amast_meta_ratio_amast_samples=(amastigote_utr3_df$num_reads_primary / 
                            pmax(amast_other_polya_sites$metacyclic, 1))
)) %>% mutate(
    length_diff=abs(metacyclic_len - amastigote_len), 
    log_average_reads=log1p(average_primary_site_num_reads),
    log_average_ptos=log1p(average_ptos_ratio),
    log_length_diff=log1p(length_diff)
)

# version with reads mapped for stages of interest
meta_amast_3utr_comparison_complete = meta_amast_3utr_comparison %>%
    filter(!is.na(metacyclic_len) & !is.na(amastigote_len)) %>%
    mutate(log_average_ratio=log1p(0.5 * (meta_amast_ratio_meta_samples +
                                   amast_meta_ratio_amast_samples)))

# UTR length in one stage vs. the other; the grey guide lines mark a length
# difference of 300 between stages
ggplot(meta_amast_3utr_comparison_complete, 
       aes(metacyclic_len, amastigote_len, color=log_average_ptos,
           size=log_average_reads)) + 
    geom_point() +
    scale_color_gradient2(low="black", mid="blue", high="red") +
    geom_abline(slope=1, intercept=300, color='#666666') +
    geom_abline(slope=1, intercept=-300, color='#666666') +
    scale_x_continuous(expand = c(0.01, 0.01)) +
    scale_y_continuous(expand = c(0.01, 0.01))
    #geom_segment(aes(x=300, y=0, xend=6000, yend=5700, color='#666666'))
    #scale_colour_continuous(high='red') 

# print top 25 genes with strongest site switching behavior
meta_amast_3utr_diff = meta_amast_3utr_comparison_complete %>% 
    filter(length_diff > 300 & log_average_ratio > 0.75) %>%
    select(-log_length_diff)
#kable(head(meta_amast_3utr_diff  %>% arrange(-score), 25))

# this section reports poly-adenylation site switching, so label it as such
print(sprintf("Number of alternatively polyadenylated genes (meta vs. amast): %d",
              nrow(meta_amast_3utr_diff)))
```

#### Procyclic vs. Amastigote

```{r alt_polya_pro_vs_amast}
# procyclic vs. amastigote
#
# Per-gene comparison of 3'UTR lengths and primary poly(A) site usage between
# the two stages. Cross-stage read counts are floored at one read before being
# used as a denominator.
# NOTE(review): average_primary_site_num_reads is computed as the absolute
# difference of the two primary site read counts, not their mean -- confirm
# that this is intended.
pro_amast_3utr_comparison = tbl_df(data.frame(
    name=procyclic_utr3_df$name,
    procyclic_len=procyclic_utr3_df$length,
    amastigote_len=amastigote_utr3_df$length,
    average_primary_site_num_reads=abs(procyclic_utr3_df$num_reads_primary -
                   amastigote_utr3_df$num_reads_primary),
    average_ptos_ratio=((procyclic_utr3_df$num_reads_primary /
                         procyclic_utr3_df$num_reads_secondary) + 
                        (amastigote_utr3_df$num_reads_primary /
                         amastigote_utr3_df$num_reads_secondary)) / 2,
    pro_amast_ratio_pro_samples=(procyclic_utr3_df$num_reads_primary /
                           pmax(pro_other_polya_sites$amastigote, 1)),
    amast_pro_ratio_amast_samples=(amastigote_utr3_df$num_reads_primary / 
                          pmax(amast_other_polya_sites$procyclic, 1))
)) %>% mutate(
    length_diff=abs(procyclic_len - amastigote_len), 
    log_average_reads=log1p(average_primary_site_num_reads),
    log_average_ptos=log1p(average_ptos_ratio),
    log_length_diff=log1p(length_diff)
)

# version with reads mapped for stages of interest
pro_amast_3utr_comparison_complete = pro_amast_3utr_comparison %>%
    filter(!is.na(procyclic_len) & !is.na(amastigote_len)) %>%
    mutate(log_average_ratio=log1p(0.5 * (pro_amast_ratio_pro_samples +
                                amast_pro_ratio_amast_samples)))

# UTR length in one stage vs. the other; the grey guide lines mark a length
# difference of 300 between stages
ggplot(pro_amast_3utr_comparison_complete, 
       aes(procyclic_len, amastigote_len, color=log_average_ptos,
           size=log_average_reads)) + 
    geom_point() +
    scale_color_gradient2(low="black", mid="blue", high="red") +
    geom_abline(slope=1, intercept=300, color='#666666') +
    geom_abline(slope=1, intercept=-300, color='#666666') +
    scale_x_continuous(expand = c(0.01, 0.01)) +
    scale_y_continuous(expand = c(0.01, 0.01))
    #geom_segment(aes(x=300, y=0, xend=6000, yend=5700, color='#666666'))
    #scale_color_gradient2(low="black", mid="blue", high="red", trans='log')
    #scale_colour_continuous(high='red') 

# print top 25 genes with strongest site switching behavior
pro_amast_3utr_diff = pro_amast_3utr_comparison_complete %>% 
    filter(length_diff > 300 & log_average_ratio > 0.75) %>%
    select(-log_length_diff)
#kable(head(pro_amast_3utr_diff  %>% arrange(-score), 25))

# this section reports poly-adenylation site switching, so label it as such
print(sprintf("Number of alternatively polyadenylated genes (pro vs. amast): %d",
              nrow(pro_amast_3utr_diff)))
```

System Information
------------------

```{r sysinfo}
# Record the R session details and the time the report was generated
print(sessionInfo())
print(date())
```