main.R

#!/usr/bin/Rscript
## Author: Taylor Falk
## tfalk@bu.edu
## BU BF591
## Assignment Week 2

#### Bioconductor ####
# it is standard among R packages to define libraries and packages at the 
# beginning of a script. Also note that a package should NOT be installed every 
# time a script runs.
# The bioconductor repository has installation instructions for biomaRt: 
# https://bioconductor.org/install/


# load tidyverse and your new bioconductor package
library(tidyr)
library(biomaRt)
library(readr)
library(dplyr)
library(tibble)
library(stringr)
library(tidyverse)
library(ggplot2)
#### Loading and processing data ####
#' Load Expression Data
#'
#' @param filepath A text string of the full filepath to the file to load.
#'
#' @return A tibble containing the data loaded from the CSV in `filepath`. 
#' 
#' @details Note that not all CSVs are created equal, and there are often cases where 
#' the data will not load in correctly on the first try. You may want to write this functon to 
#' adjust the CSV file being loaded into this assignment so that it can be formed into a 
#' tibble correctly.
#'
#' @examples 
#' `data <- load_expression('/project/bf528/project_1/data/example_intensity_data.csv')`
load_expression <- function(filepath) {
  result = read_delim(filepath, delim=' ', col_names=TRUE)
  return(result)
}

#' Filter 15% of the gene expression values.
#'
#' @param tibble A tibble of expression values, rows by probe and columns by sample.
#'
#' @return A tibble of affymetrix probe names from the input expression data tibble. 
#' These names match the rows with 15% or more of the expression values above log2(15).
#' 
#' @details This is similar to the filters being implemented in BF528's project 1. 
#' We do not necessarily want to capture all parts of the assay in our analysis, so 
#' filters like this serve to reduce the noise and amount of data to examine.
#'
#' @examples `samples <- filter_15(data_tib)`
#' `> str(samples)`
#' `tibble [40,158 × 1] (S3: tbl_df/tbl/data.frame)`
#' `$ probeids: chr [1:40158] "1007_s_at" "1053_at" "117_at" "121_at" ...`
filter_15 <- function(tibble){
  result <- dplyr::mutate(tibble, count = rowSums(tibble > log(15,2)))
  filt_expr <- dplyr::filter(result, count > 5.0)
  probe_vec <- filt_expr$probeids
  return(as_tibble(probe_vec))
}
# this gives me 40,160 rows, 2 more than expected

#### Gene name conversion ####

#' Convert affymetrix array names into hgnc_symbol IDs using biomaRt. Inputs and 
#' outputs will likely not be the same size.
#'
#' @param affy_tib A single column tibble of strings containing array names.
#'
#' @return A 2 column tibble that contains affy IDs in the first column,
#' and their corresponding HGNC gene ID in the second column. Note that not all affy IDs 
#' will necessarily correspond with a gene ID, and one gene may have multiple affy IDs.
#' 
#' @details Connecting to ensembl via biomaRt can be...hit or miss...so you may 
#' want to check if data was correctly returned (or if it was just empty). The 
#' `getBM()` function may not accept a tibble, so you might need to convert your 
#' input into a flat vector.
#'
#' @examples 
#' `> affy_to_hgnc(tibble(c('202860_at', '1553551_s_at')))`
#' `affy_hg_u133_plus_2 hgnc_symbol`
#' `1        1553551_s_at      MT-ND1`
#' `2        1553551_s_at       MT-TI`
#' `3        1553551_s_at       MT-TM`
#' `4        1553551_s_at      MT-ND2`
#' `5           202860_at     DENND4B`
affy_to_hgnc <- function(affy_vector) {
  ensembl <- useEnsembl(biomart = "ENSEMBL_MART_ENSEMBL", 
                        dataset = "hsapiens_gene_ensembl",
                        mirror = 'useast')
  BM_result <- getBM(attributes = c("affy_hg_u133_plus_2", "hgnc_symbol"),
        filters = 'affy_hg_u133_plus_2',
        values = affy_vector, 
        mart = ensembl)
  return(BM_result)
}

#### ggplot ####

#' Reduce a tibble of expression data to only the rows in good_genes or bad_genes.
#'
#' @param expr_tibble A tibble holding the expression data, each row corresponding to
#' one affymetrix probe ID and each column to a sample.
#' @param names_ids A two column tibble that associates affy IDs with HGNC gene IDs. 
#' Generated `with affy_to_hgnc()`.
#' @param good_genes A list of gene names stored as a vector of strings.
#' @param bad_genes A list of gene names stored as a vector of strings.
#'
#' @return A tibble with two additional columns added:
#' 1. HGNC gene IDs 
#' 2. Does the gene is this row fall into "good" or "bad" genes?
#' This tibble should be reduced to only rows present in good or bad genes. All
#' other rows can be discarded.
#' 
#' @details In order to plot only our genes of interest, we need to rearrange our 
#' data to include only the elements we want to see. We also want to add to columns, 
#' one that associates the probeids with the HGNC gene name, and one that says if 
#' that gene is in the good or bad sets of genes.
#'
#' @examples 
#' `plot_tibble <- reduce_data(expr_tibble = expr, names_ids = sample_names,`
#' `                           goodGenes, badGenes)`
#' `> head(plot_tibble)`
#' `A tibble: 6 × 38`
#' `  probeids    hgnc    gene_set    GSM972389 ...`
#' `  <chr>       <chr>   <chr>       <dbl>     ...`
#' `1 202860_at   DENND4B good        7.16      ...`
#' `2 204340_at   TMEM187 good        6.40      ...`
reduce_data <- function(expr_tibble, names_ids, good_genes, bad_genes){
  good_bad <- c(good_genes, bad_genes) 
    # vector of good and bad gene HGNCs
  sel_probe_indices <- which(names_ids$hgnc_symbol %in% good_bad)
    # vector of indices in sample_name where we find good and bad genes
  good_bad_probes <- names_ids %>% slice(sel_probe_indices)
    # make a slice of sample_names with only the selected rows
  sel_expr_row_indices <- which(expr_tibble$probeids %in% good_bad_probes$affy_hg_u133_plus_2)
    # vector of indices in expr dataframe where probe ID matches good/bad genes
  sel_filt_expr <- expr_tibble %>% slice(sel_expr_row_indices)
    # tibble of expression data and probe ids for only the selected probes
  sel_filt_expr <- add_column(sel_filt_expr, hgnc = good_bad_probes$hgnc_symbol, .after = 'probeids') #
  bool_vec <- character()
  for (i in sel_filt_expr$hgnc){
    if (i %in% good_genes){
      bool_vec <- append(bool_vec, 'Good')
    }
    else{
      bool_vec <- append(bool_vec, 'Bad')
    }
  }
  result_tibble <- add_column(sel_filt_expr, gene_set = bool_vec, .after = 'hgnc')
  result_tibble <- dplyr::arrange(result_tibble,gene_set)
  return(result_tibble)
}


#' Plot a boxplot of good and bad genes.
#'
#' @param tibble A reduced tibble of expression data, with information about
#' good and bad genes and gene names.
#'
#' @return A ggplot object which contains a boxplot of the genes and samples we 
#' are interested in.
#' 
#' @details This function performs one additional step before using `ggplot()`: 
#' converting the _wide_ format of the input tibble to a _long_ format.
#'
#' @examples `p <- plot_ggplot(plot_tibble)`
plot_ggplot <- function(tibble) {
  all_col <- colnames(tibble)
  sample_columns <- all_col[4:38]
  PIVOT = tidyr::pivot_longer(tibble, sample_columns, names_to='Sample', values_to='Expression')
  PIVOT_new <- dplyr::arrange(PIVOT, gene_set)
  PIVOT %>% mutate(hgnc = factor(PIVOT$hgnc, levels = unique(PIVOT_new$hgnc), order=T)) -> PIVOT
  ggplot(data=PIVOT, mapping=aes(x=hgnc, y=Expression, fill=gene_set)) +
    geom_boxplot() +
    scale_fill_manual(breaks=PIVOT$gene_set, values=c("#DF2935", "#71B48D")) +
    labs(title = "Expression of 'good' and 'bad' Genes") + 
    theme(axis.text = element_text(angle = 45)) +
    theme(plot.title = element_text(hjust = 0.5))
}