-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.R
184 lines (173 loc) · 7.48 KB
/
main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/Rscript
## Author: Taylor Falk
## BU BF591
## Assignment Week 2
#### Bioconductor ####
# it is standard among R packages to define libraries and packages at the
# beginning of a script. Also note that a package should NOT be installed every
# time a script runs.
# The bioconductor repository has installation instructions for biomaRt:
# https://bioconductor.org/install/
# load tidyverse and your new bioconductor package
library(tidyr)
library(biomaRt)
library(readr)
library(dplyr)
library(tibble)
library(stringr)
library(tidyverse)
library(ggplot2)
#### Loading and processing data ####
#' Load Expression Data
#'
#' @param filepath A text string of the full filepath to the file to load.
#'
#' @return A tibble containing the data loaded from the CSV in `filepath`.
#'
#' @details Note that not all CSVs are created equal, and there are often cases where
#' the data will not load in correctly on the first try. You may want to write this functon to
#' adjust the CSV file being loaded into this assignment so that it can be formed into a
#' tibble correctly.
#'
#' @examples
#' `data <- load_expression('/project/bf528/project_1/data/example_intensity_data.csv')`
load_expression <- function(filepath) {
result = read_delim(filepath, delim=' ', col_names=TRUE)
return(result)
}
#' Filter 15% of the gene expression values.
#'
#' @param tibble A tibble of expression values, rows by probe and columns by sample.
#'
#' @return A tibble of affymetrix probe names from the input expression data tibble.
#' These names match the rows with 15% or more of the expression values above log2(15).
#'
#' @details This is similar to the filters being implemented in BF528's project 1.
#' We do not necessarily want to capture all parts of the assay in our analysis, so
#' filters like this serve to reduce the noise and amount of data to examine.
#'
#' @examples `samples <- filter_15(data_tib)`
#' `> str(samples)`
#' `tibble [40,158 × 1] (S3: tbl_df/tbl/data.frame)`
#' `$ probeids: chr [1:40158] "1007_s_at" "1053_at" "117_at" "121_at" ...`
filter_15 <- function(tibble){
result <- dplyr::mutate(tibble, count = rowSums(tibble > log(15,2)))
filt_expr <- dplyr::filter(result, count > 5.0)
probe_vec <- filt_expr$probeids
return(as_tibble(probe_vec))
}
# this gives me 40,160 rows, 2 more than expected
#### Gene name conversion ####
#' Convert affymetrix array names into hgnc_symbol IDs using biomaRt. Inputs and
#' outputs will likely not be the same size.
#'
#' @param affy_tib A single column tibble of strings containing array names.
#'
#' @return A 2 column tibble that contains affy IDs in the first column,
#' and their corresponding HGNC gene ID in the second column. Note that not all affy IDs
#' will necessarily correspond with a gene ID, and one gene may have multiple affy IDs.
#'
#' @details Connecting to ensembl via biomaRt can be...hit or miss...so you may
#' want to check if data was correctly returned (or if it was just empty). The
#' `getBM()` function may not accept a tibble, so you might need to convert your
#' input into a flat vector.
#'
#' @examples
#' `> affy_to_hgnc(tibble(c('202860_at', '1553551_s_at')))`
#' `affy_hg_u133_plus_2 hgnc_symbol`
#' `1 1553551_s_at MT-ND1`
#' `2 1553551_s_at MT-TI`
#' `3 1553551_s_at MT-TM`
#' `4 1553551_s_at MT-ND2`
#' `5 202860_at DENND4B`
affy_to_hgnc <- function(affy_vector) {
ensembl <- useEnsembl(biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "hsapiens_gene_ensembl",
mirror = 'useast')
BM_result <- getBM(attributes = c("affy_hg_u133_plus_2", "hgnc_symbol"),
filters = 'affy_hg_u133_plus_2',
values = affy_vector,
mart = ensembl)
return(BM_result)
}
#### ggplot ####
#' Reduce a tibble of expression data to only the rows in good_genes or bad_genes.
#'
#' @param expr_tibble A tibble holding the expression data, each row corresponding to
#' one affymetrix probe ID and each column to a sample.
#' @param names_ids A two column tibble that associates affy IDs with HGNC gene IDs.
#' Generated `with affy_to_hgnc()`.
#' @param good_genes A list of gene names stored as a vector of strings.
#' @param bad_genes A list of gene names stored as a vector of strings.
#'
#' @return A tibble with two additional columns added:
#' 1. HGNC gene IDs
#' 2. Does the gene is this row fall into "good" or "bad" genes?
#' This tibble should be reduced to only rows present in good or bad genes. All
#' other rows can be discarded.
#'
#' @details In order to plot only our genes of interest, we need to rearrange our
#' data to include only the elements we want to see. We also want to add to columns,
#' one that associates the probeids with the HGNC gene name, and one that says if
#' that gene is in the good or bad sets of genes.
#'
#' @examples
#' `plot_tibble <- reduce_data(expr_tibble = expr, names_ids = sample_names,`
#' ` goodGenes, badGenes)`
#' `> head(plot_tibble)`
#' `A tibble: 6 × 38`
#' ` probeids hgnc gene_set GSM972389 ...`
#' ` <chr> <chr> <chr> <dbl> ...`
#' `1 202860_at DENND4B good 7.16 ...`
#' `2 204340_at TMEM187 good 6.40 ...`
reduce_data <- function(expr_tibble, names_ids, good_genes, bad_genes){
good_bad <- c(good_genes, bad_genes)
# vector of good and bad gene HGNCs
sel_probe_indices <- which(names_ids$hgnc_symbol %in% good_bad)
# vector of indices in sample_name where we find good and bad genes
good_bad_probes <- names_ids %>% slice(sel_probe_indices)
# make a slice of sample_names with only the selected rows
sel_expr_row_indices <- which(expr_tibble$probeids %in% good_bad_probes$affy_hg_u133_plus_2)
# vector of indices in expr dataframe where probe ID matches good/bad genes
sel_filt_expr <- expr_tibble %>% slice(sel_expr_row_indices)
# tibble of expression data and probe ids for only the selected probes
sel_filt_expr <- add_column(sel_filt_expr, hgnc = good_bad_probes$hgnc_symbol, .after = 'probeids') #
bool_vec <- character()
for (i in sel_filt_expr$hgnc){
if (i %in% good_genes){
bool_vec <- append(bool_vec, 'Good')
}
else{
bool_vec <- append(bool_vec, 'Bad')
}
}
result_tibble <- add_column(sel_filt_expr, gene_set = bool_vec, .after = 'hgnc')
result_tibble <- dplyr::arrange(result_tibble,gene_set)
return(result_tibble)
}
#' Plot a boxplot of good and bad genes.
#'
#' @param tibble A reduced tibble of expression data, with information about
#' good and bad genes and gene names.
#'
#' @return A ggplot object which contains a boxplot of the genes and samples we
#' are interested in.
#'
#' @details This function performs one additional step before using `ggplot()`:
#' converting the _wide_ format of the input tibble to a _long_ format.
#'
#' @examples `p <- plot_ggplot(plot_tibble)`
plot_ggplot <- function(tibble) {
all_col <- colnames(tibble)
sample_columns <- all_col[4:38]
PIVOT = tidyr::pivot_longer(tibble, sample_columns, names_to='Sample', values_to='Expression')
PIVOT_new <- dplyr::arrange(PIVOT, gene_set)
PIVOT %>% mutate(hgnc = factor(PIVOT$hgnc, levels = unique(PIVOT_new$hgnc), order=T)) -> PIVOT
ggplot(data=PIVOT, mapping=aes(x=hgnc, y=Expression, fill=gene_set)) +
geom_boxplot() +
scale_fill_manual(breaks=PIVOT$gene_set, values=c("#DF2935", "#71B48D")) +
labs(title = "Expression of 'good' and 'bad' Genes") +
theme(axis.text = element_text(angle = 45)) +
theme(plot.title = element_text(hjust = 0.5))
}