mothur-data-R-tutorial.R

library(tidyverse)
library(ggplot2)

metadata <- read_excel(path="raw_data/baxter.metadata.xlsx",
                       col_types=c(sample = "text", fit_result = "numeric", Site = "text", Dx_Bin = "text",
                                   dx = "text", Hx_Prev = "logical", Hx_of_Polyps = "logical", Age = "numeric",
                                   Gender = "text", Smoke = "logical", Diabetic = "logical", Hx_Fam_CRC = "logical",
                                   Height = "numeric", Weight = "numeric", NSAID = "logical", Diabetes_Med = "logical",
                                   stage = "text")
)
metadata[["Height"]] <- na_if(metadata[["Height"]], 0)
metadata[["Weight"]] <- na_if(metadata[["Weight"]], 0)
metadata[["Site"]] <- recode(.x=metadata[["Site"]], "U of Michigan"="U Michigan")
metadata[["Dx_Bin"]] <- recode(.x=metadata[["Dx_Bin"]], "Cancer."="Cancer")
metadata[["Gender"]] <- recode(.x=metadata[["Gender"]], "m"="male")
metadata[["Gender"]] <- recode(.x=metadata[["Gender"]], "f"="female")
metadata <- rename_all(.tbl=metadata, .funs=tolower)
metadata <- rename(.data=metadata,
                   previous_history=hx_prev,
                   history_of_polyps=hx_of_polyps,
                   family_history_of_crc=hx_fam_crc,
                   diagnosis_bin=dx_bin,
                   diagnosis=dx,
                   sex=gender)

rarefy <- read.csv('stability.csv',header = T) %>%
  select(-contains("lci-")) %>%
  select(-contains("hci-")) %>% 
  gather(-numSampled,key=sample,value=sobs) %>%
  mutate(sample=str_replace_all(sample,pattern = "0.03-",replacement = "")) %>%
  drop_na()
rarefy
metadata <- get.metadata()
metadata_rarefy <- inner_join(metadata, rarefy)
ggplot(data=rarefy,aes(x=numSampled,y=sobs,group=sample,color=numSampled))+
  geom_line()
#按诊断分组着色
gplot(metadata_rarefy, aes(x=numsampled, y=sobs, group=sample, color=diagnosis)) +
  geom_line()
ggplot(metadata_rarefy, aes(x=numsampled, y=sobs, group=sample, color=diagnosis)) +
  geom_line() +
  scale_color_manual(name=NULL,
                     values=c("blue", "red", "black"),
                     breaks=c("normal", "adenoma", "cancer"),
                     labels=c("Normal", "Adenoma", "Cancer")) +
  labs(title="Rarefaction curves are pretty pointless at this scale",
       x="Number of Sequences Sampled per Subject",
       y="Number of OTUs per Subject") +
  theme_classic()
#设定坐标范围
ggplot(metadata_rarefy, aes(x=numsampled, y=sobs, group=sample, color=diagnosis)) +
  geom_line() +
  scale_color_manual(name=NULL,
                     values=c("blue", "red", "black"),
                     breaks=c("normal", "adenoma", "cancer"),
                     labels=c("Normal", "Adenoma", "Cancer")) +
  coord_cartesian(xlim=c(0,20000), ylim=c(0,500)) +
  labs(title="Rarefaction curves are pretty pointless at this scale",
       x="Number of Sequences Sampled per Subject",
       y="Number of OTUs per Subject") +
  theme_classic()
#减少示例
set.seed(1) #this makes sure that we all get the same result!
metadata_rarefy_sample <- metadata %>%
  sample_n(10) %>%
  inner_join(., rarefy)

ggplot(metadata_rarefy_sample, aes(x=numsampled, y=sobs, group=sample, color=diagnosis)) +
  geom_line() +
  scale_color_manual(name=NULL,
                     values=c("blue", "red", "black"),
                     breaks=c("normal", "adenoma", "cancer"),
                     labels=c("Normal", "Adenoma", "Cancer")) +
  coord_cartesian(xlim=c(0,20000), ylim=c(0,500)) +
  labs(title="Rarefaction curves are pretty pointless at this scale",
       x="Number of Sequences Sampled per Subject",
       y="Number of OTUs per Subject") +
  theme_classic()
#改变线型
ggplot(metadata_rarefy_sample, aes(x=numsampled, y=sobs, group=sample, color=diagnosis)) +
  geom_line(linetype=5) +
  scale_color_manual(name=NULL,
                     values=c("blue", "red", "black"),
                     breaks=c("normal", "adenoma", "cancer"),
                     labels=c("Normal", "Adenoma", "Cancer")) +
  coord_cartesian(xlim=c(0,20000), ylim=c(0,500)) +
  labs(title="Rarefaction curves are pretty pointless at this scale",
       x="Number of Sequences Sampled per Subject",
       y="Number of OTUs per Subject") +
  theme_classic()
#添加竖线
ggplot(metadata_rarefy_sample, aes(x=numsampled, y=sobs, group=sample, color=diagnosis)) +
  geom_hline(yintercept=200, color="gray", size=2) +
  geom_line() +
  scale_color_manual(name=NULL,
                     values=c("blue", "red", "black"),
                     breaks=c("normal", "adenoma", "cancer"),
                     labels=c("Normal", "Adenoma", "Cancer")) +
  coord_cartesian(xlim=c(0,20000), ylim=c(0,500)) +
  labs(title="Rarefaction curves are pretty pointless at this scale",
       x="Number of Sequences Sampled per Subject",
       y="Number of OTUs per Subject") +
  theme_classic()


#===============================================================================
#
#	Introduction to R for use with output from mothur and other tools commonly
#	used in computational microbial ecology
#
#	Pat Schloss
#	pschloss@umich.edu
#
#
#	These notes are inspired and loosely built upon a tutorial developed by
#	Pawel Gajer (pgajer@gmail.com) at the University of Maryland Institute for
#	Genomic Sciences
#
#
#	The purpose of this tutorial is to introduce microbial ecologsist to the R
#	statistical programming enviroment.  You should create a folder that will
#	hold the data files that we will use and creat in the tutorial.  I will be
#	assuming that you either have never used R or used it in a rather basic
#	manner.
#
#	At the end you will find a list of resources that you may find useful for
#	learning more about R
#
#	This tutorial can be run by entering the following command within R...
#
#		source(file="tutorial.R", echo=T)
#
#===============================================================================
#
#	Introduction to the R environment
#
#===============================================================================
#	
#	The R environment is a command line where you enter commands.  When you 
#	fire up R, you will see the following:
#	
#		R version 2.10.0 (2009-10-26)
#		Copyright (C) 2009 The R Foundation for Statistical Computing
#		ISBN 3-900051-07-0
#		
#		R is free software and comes with ABSOLUTELY NO WARRANTY.
#		You are welcome to redistribute it under certain conditions.
#		Type 'license()' or 'licence()' for distribution details.
#		
#		  Natural language support but running in an English locale
#		
#		R is a collaborative project with many contributors.
#		Type 'contributors()' for more information and
#		'citation()' on how to cite R or R packages in publications.
#		
#		Type 'demo()' for some demos, 'help()' for on-line help, or
#		'help.start()' for an HTML browser interface to help.
#		Type 'q()' to quit R.
#	
#		>
#
#	The ">" is the prompt where you enter the commands.  I will not be putting
#	the prompt in the tutorial.  Let's get started by entering the following

2+2

#	You will see the following:
#
#		> 2+2
#		[1] 4
#
#	This output tells you the obvious result that 2+2 is 4.  R can be used as an
#	over-grown calculator, but it is so much more than that.  It's really a high
#	level programming language to give you publication worthy graphics and 
#	perform complex statistical analyses.
#
#	If you want to quit R, run the following:
	
#q()

#	You will encounter the follwing prompt:
#
#		Save workspace image? [y/n/c]: 
#
#	If you select "y" then the next time you run R, it will recall your previous
#	commands.

#y

#	Fire up R again and hit the up arrow twice.  You will see the following:
#
#		> 2+2;
#
#	As a short cut you can also type the following to get the same effect when
#	quitting R:

#q("yes")

#	Something to note as you go through the tutorial is the use of the pound (#)
#	symbol.  This indicates that what follows is a comment.  For example:

2+2            #this is a comment

#	Another useful feature is the inline help feature:

?mean

#	This will output a help file for the 'mean' function.  You can scroll
#	through it with the arrow keys as well as the space bar.  At the end of the 
#	help text are one or more examples of how to use the function.  You can
#	execute this example by copying and pasting the text.  Alternatively, you 
#	can use the 'example' command:

example(plot)

#	Not sure what the function is that you want?  Use the '??' function...

??pca

#	This will tell you the list of functions and packages that have some
#	reference to "pca".
#
#	Running "2+2" gets you the answer of "4".  But what if you want to store
#	that value?  Save it as a variable...

x <- 2+2

# 	or

x = 2+2
x

#	The left arrow is the preferred method of assigning values to variables.
#	Now you can manipulate that variable:

x + 2

#	You can run two commands on the same line by separating the commands with a
#	semicolon:

x<-2+2;x+2

#	To see what variables have been declared:

ls()

#	To remove variables you no longer need :

rm(x)
#x

#	Combining the last two commands will remove all of the variables stored in
#	memory:

rm(list=ls())
ls()

#	To keep our examples relevant, let's quickly discuss how to read in a table.
#	The frist two commands below are not necessarily needed if you know you are
#	in the same folder as your data

getwd()          				    # is this the directory we want to be in?
setwd("~/Desktop/IntroToR")         # if not let's change directories
seq.sum <- read.table(file="stool.trim.fasta.summary", header=T)

#	The read.table command will read in your data table and keep the column
#	headings.  These data represent the length of sequece in the Costello stool
#	dataset after removing primers, barcodes, and low quality bases.  To make
#	the data easier to manipulate you can do the following:

colnames(seq.sum)        # see the column heading names
seq.sum$nbasesp[1:10]    # get the column "nbases" out of the table
attach(seq.sum)          # "attach" the table so that you don't need the "$"
nbases[1:10]

#	Don't worry if you don't understand everything in the last several lines.
#	The important thing is to see that there are different ways to get the same
#	output
#
#	Ok, that's enough preliminaries for now.  Let's get going...


#===============================================================================
#
#	Variables
#
#===============================================================================
#
#	Technically speaking everything in R is an object.  That may be a bit into
#	the weeds for most people.  But try to keep up.  There are many types of
#	objects in R.  We'll start with these:
#		* Numeric values
#		* Character values
#		* Logical values
#		* Vectors
#		* Matrices
#	
#
#	--Numeric values--
#	The variable we used above, x, is a numeric value.  Here are some other
#	examples:

x <- 2/3; x
y <- 31.7
z <- 2.1e7                         # same as 2.1*10^7
longAndSilly.variable.name <- pi/3 # an example of a long variable name

#	Although it is a matter of personal style, ideally, your variable names
#	will have some meaning to you.  For example:

genomeSize <- 4.5e6
#16SCopyNumber <- 6           # illegal variable name
rrnCopyNumber <- 6

#	A wide variety of arithmetic functions can be applied to these types of
#	variables:

log(x)           # natural logarithm
log2(x)          # base 2 logarithm
log10(x)         # base 10 logarithm
exp(x)           # exponent
sqrt(x)          # square root
abs(x)           # absolute value
floor(x)         # the largest integers not greater than x
ceiling(x)       # the smallest integers not less than x
x %% 2           # remainder or modulo operator


#	--Character values--
#	Strings of character values are useful for storing text information

A <- "Alanine"; A
R <- "Argenine"; R
N <- "Asparagine"; N

#	The paste function is useful for converting numerical information into a
#	string

x<-3.14159
text<-paste("x=", x, " y=", y, sep="")
text

#	or

text<-paste("x=", format(x, digits=3), " y=", y, sep="")
text



#	--Logical values--
#	Logical variables can have one of two values - TRUE or FALSE

x <- TRUE
y <- FALSE

!x              # NOT operator
x && y          # AND operator
x & y           # bitwise AND operator (mainly for vectors)
x || y          # OR operator
x | y			# bitwise OR operator (mainly for vectors)
x == y          # is equal operator
x != y          # is not equal operator

#	We can also perform logical operators on numerical variables:

x <- 5
y <- 3

x > y          # greater than operator
x >= y         # greater than or equal to operator
x < y          # less than operator
x <= y         # less than  or equal tooperator
x == y         # is equal to operator
x != y         # is not equal to operator

#	We can also convert numerical values to logical valeus

x <- 0
as.logical(x)
as.logical(y)

#	We can also use logical operators on strings

x <- "ATG"
y <- "CCC"

x > y          # greater than operator
x >= y         # greater than or equal to operator
x < y          # less than operator
x <= y         # less than  or equal tooperator
x == y         # is equal to operator
x != y         # is not equal to operator


#	--Vectors--
#	Vectors are a one-dimensional set of values of the same type.  You can read
#	in vectors from a file or create them on the fly.  Here are several 
#	examples:

19:55                   # list the values from 19 to 55 by ones
c(1,2,3,4)              # concatenate 1, 2, 3, 4, 5 into a vector
rep("red", 5)           # repeat "red" five times
seq(1,10,by=3)          # list the values from 1 to 10 by 3's
seq(1,10,length.out=20) # list 20 evenly spaced elements from 1 to 10
seq(1,10,len=20)        # same thing; arguments of any function can be 
                        # abbreviated as long as the abbreviation is unique

#	These can be combined as well

c(rep("red", 5), rep("white", 5), rep("blue", 5))
rep(c(0,1), 10)

#	And they can be assigned to variables

x <- seq(1,100,by=0.5)
x

code <- c("A", "T", "G", "C")

#	Note that in contrast to many programming languages, vectors in R are 
#	indexed such that the first value is 1 NOT 0.

code[2]
code[0]
code[-1]
code[c(1,2)]

#	Recall that we've already seen vectors in the preamble to this tutorial...

nbases

#	You can easily determine the length of a vector

length(code)
code[length(code)]
length(nbases)

#	Logical operators can also be used on vectors...

tf <- x > 50
isLong <- nbases > 200    # do you know what this is doing?

#	And used to select portions of vectors

x[tf]

#	One of the awesome things about vectors is that you can perform algebraic
#	manipulations on them.  These types of operations are "elementwise" meaning
#	that the operation is applied to each operation

2 * x
x + x
log(x)

#	To define a vector without giving it any values

z <- numeric(5)             #	This creates a numerical vector with 5 zeros
z[3] <- 10
z
z[1:3] <- 5
z
z[10] <- pi

t <- character(5)
t[4] <- "DNA rocks!"


#	You can also create vectors that are indexed by character strings.  In some
#	programming languages these are called hash-maps or look-up tables.

v <- numeric(0)
v["A"] <- 1.23498
v["T"] <- 2.2342
v["C"] <- 3
v["G"] <- 4
v

#	You can get the name of each cell in the vector:

names(v)
names(v) <- NULL  # this removes names attribute
v

names(nbases) <- seqname
nbases[1:10]

#	Alternatively, we could define v2 as

v2 <- c(A=1.23498,T=2.2342,C=3,G=4)
v2
v2 * 2
as.vector(v2)  # strips out names
is.vector(v2)  # checks if v is a vector


#	We can access elements of 'v' by their labels

v2["A"]
v2[["A"]]      # strips the name associated with value 1


#	There are many ways to get a value out of a vector.  We've already seen
#	some of these

v <- floor(runif(10, 1,10))# create a vector with 10 values randomly drawn from 
v                   # the range of 1 to 10
n <- 3
v[n]                # n-th element
v[-n]               # all but the n-th element
v[1:n]              # first n elements
v[-(1:n)]           # elements from n+1 to the end
v[c(2,4)]           # 2-nd and 4-th elements
v["name"]           # element named "name"
v[v > 6]            # elements greater than 6
v[v > 4 & v < 6]    # elements between 4 and 6
v[v %in% c(1,3,7)]  # elements in a given set
v[!is.na(v)]        # subsequence of v consisting of non-missing values of v
v[is.na(v)] <- 0    # sets all missing values to 0


#	There are a variety of operators that take vectors as input

length(v)          # length of vector v
mean(v)            # mean of v
median(v)          # median of v
sd(v)              # standard deviation of v
var(v)             # variance of v
mad(v)             # median absolute deviation of v
min(v)             # min of v
max(v)             # maximal element of v
which.min(v)       # returns the index of the smallest element of v
which.max(v)       # returns the index of the greatest element of v
summary(v)         # return descriptive statistics of v
sum(v)             # sum of elements of v
prod(v)            # product of all elements of v
sort(v)            # ascending sort the values of v
sort(v, decreasing=T)# descending sort the values of v
order(v)           # the order of the sorted values of v
v[order(v)]        # gives the same order as sort(v)
rev(v)             # reverse the order of v
unique(v)          # give the unique values of v
all(v)             # returns TRUE if all values of a logical vector v are TRUE,
                   # otherwise returns FALSE
any(v)             # returns TRUE if at least one value of a logical vector v is
                   # TRUE, otherwise returns FALSE

#	Exercises:  Calculate the following on the data we read in from the seq.sum
#	file and stored as the "nbases" vector...
#	
#		number of sequences


#		number of sequences longer than 200 bp


#		mean seqeunce length


#		median sequence length


#		mean sequence length for sequences longer than 200 bp


#		stanard deviation of sequence length for sequences longer than 200 bp


#		get seq.sum statisitics for the length of sequences



#	Note that if a vector contains missing values then all above numerical 
#	routines are going to return NA value. In order to skip 
#	missing values of 'v' while computing some seq.sum statistics of 'v', one 
#	can set na.rm to TRUE as in the following example


v[11] = NA
mean(v)
mean(v, na.rm=T)


#	Say you want to search a character vector for a feature of the strings.  To
#	do this you can use the grep function and its relatives to get the indices
#	of values in the vector that match the search

shades.of.red <- grep("red", colors())
colors()[shades.of.red]


#	--Tables & Matrices--
#	Tables and matrices are multi-dimensional sets of values of multiple or the
#	same type.  We already read in a table in the preamble of this tutorial...

getwd()          				    # is this the directory we want to be in?
setwd("~/Desktop/IntroToR")         # if not let's change directories
seq.sum <- read.table(file="stool.trim.fasta.summary", header=T)

#	Here "seq.sum" is a table of values describing some characteristics of the
#	sequences after removing the barcodes, primers, and low quality base calls.
#	Tables and matrices are indexed much like vectors...

seq.sum[1,1]             # get the name of the first sequence in the table
seq.sum[1,c(1,4)]        # get the name and length of the first sequence 
seq.sum[1,]              # get all of the data in the first row
seq.sum[,4]              # get all of the data in the column labelled "nbases"
seq.sum[,"nbases"]       # get all of the data in the column labelled "nbases"
seq.sum$nbases           # get all of the data in the column labelled "nbases"
attach(seq.sum);nbases   # get all of the data in the column labelled "nbases"
detach(seq.sum)

#	It should be clear taht the last four commands above gave the same data.
#	With the exception of the first command, each of these commands returns a
#	vector.
#
#	When reading in a table it is always nice to get a handle of what you're
#	working with...

dim(seq.sum)             # number of rows and columns in "seq.sum"
nrow(seq.sum)            # number of rows "seq.sum"
ncol(seq.sum)            # number of columns in "seq.sum"
colnames(seq.sum)        # names of the columns in "seq.sum"
rownames(seq.sum)        # names of the rows in "seq.sum"
seq.sum[1:10,]           # get the first 10 rows of data from the "seq.sum" file

#	Looking at the output from the 'rownames' command we know we can do better
#	than those row names.  Let's use the sequence names as the row names...

rownames(seq.sum)<-seq.sum$seqname
seq.sum<-seq.sum[,-1]
seq.sum[1:10,]


#	Let's look at how we could get the row indices corresponding to sequences
#	longer than 200 bp

(1:length(nbases))[nbases > 200]

#	To get the sequence names we could then do...

rownames(seq.sum)[(1:length(nbases))[nbases > 200]]  #wonky
rownames(seq.sum)[seq.sum$nbases > 200]              #better

#	Earlier we showed how to sort vectors.  Let's sort seqeunce legnth by the
#	length of the longest homopolymer in the sequence...

polymerOrder <- order(seq.sum$polymer)
seq.sum[polymerOrder,c("polymer", "nbases")]

#	You can see that's not exactly what we were hoping for.  We can actually 
#	use the order command to sort on multiple columns...

polymerLengthOrder <- order(seq.sum$polymer, seq.sum$nbases)
polymerLength <- seq.sum[polymerLengthOrder,c("polymer", "nbases")]
polymerLength

#	Let's write the sorted lengths to a new file
write(polymerLength$nbases,file="lengths.txt")

#	To read a vector in from a file...

lengths <- scan("lengths.txt")

#	Those two commands are for writing and reading vectors.  If we wanted to 
#	write and read a table we'd do the following...

write.table(polymerLength,file="polymerLengths.txt", quote=F)

#	If you open polymerLengths.txt in a text editor you will see that there are
#	three columns of data, but only two headings.  If you have one fewer 
#	headings than columns, R will use the first column of data as the rownames 
#	when you try to read it back in...

table<-read.table(file="polymerLengths.txt", header=T)


#	There are some functions that take matrices/tables as input

m <- matrix(floor(runif(100, 1, 10)), nrow=10, ncol=10)	#create a 10 x 10 matrix
t(m)             # transpose the matrix
1/m              # take each value of m and find it's reciprocal
m * m            # calculate the square of each value in m
m %*% m          # performs matrix multiplication
crossprod(m,m)   # performs the cross product
rowSums(m)       # calculate the sum for each row
colSums(m)       # calculate the sum for each column
lower.tri(m)     # find the indices that are below the diagonal
m[lower.tri(m)]  # give the lower triangle of m
diag(m)          # the values on the diagonal of m
det(m)           # the determinent of m

#	If you try to get the mean or standard deviation of a row or column you'll
#	struggle mightilly with these commands.  Instead you need to use the "apply"
#	command

mean(m)
apply(m, 1, mean)	# get the mean for each row (that's the 1)
apply(m, 2, mean)	# get the mean for each column (that's the 2)
apply(m, 1, sum)	# get the sum for each row - same as rowSums(m)
apply(m, 2, sum)	# get the sum for each column - same as colSums(m)


#	--Factors--
#	Factors are categorical variables.  The "seq.sum" table doesn't exactly
#	contain any categorical data.  For the purposes of discussion, let's use
#	the polymer column as a categorical data type.  The important thing is that
#	factors have discrete levels.  For example, in microbial ecology, we might
#	think of soil types, body sites, a person's sex, or whether a site is
#	polluted as categorical variables.  
#	Factors can be created using factor() routine.

seq.sum$polymer <-factor(seq.sum$polymer)
levels(seq.sum$polymer)

#	If we wanted to convert our factors from factors to strings or to numbers
#	we could do the following...
#
#		seq.sum$polymer <- as.character(seq.sum$polymer)
#		seq.sum$polymer <- as.numeric(seq.sum$polymer)
#
#
#	We might be interested to see if sequence length varies with the length
#	of the homopolymer in the sequence.  We can do this with the aggregate
#	command and treating polymer as a factor...

aggregate(seq.sum$nbases, list(polymer), mean)
aggregate(seq.sum$nbases, list(polymer), median)
aggregate(seq.sum$nbases, list(polymer), sd)

#	Similar to the aggregate command, the "by" command will allow you to take
#	all of the columns and perform an operation on the...

by(seq.sum, seq.sum["polymer"], summary)


#===============================================================================
#
#	Programming basics
#
#===============================================================================
#
#	R is a pretty high-level programming language with extensive functionality
#	built in.  The strength of R is that as a user you can add to this to suit
#	your needs
#
#	--Customized functions--
#	Want to make your own function or package?  It's relatively simple.  The
#	general syntax is as follows
#
#		functionName <- function(x){
#			#the stuff goes here
#		}
#
#	So we might write a trivial script to calculate the square root of a value.

my.sqrt<-function(x){
	sqrt(x)
}

my.values <- 1:10
my.sqrt(my.values)

#	The output of "my.sqrt(my.values)" should be the same as the following:

sqrt(1:10)


#	--for loops--
#	If you want to do something to each value in a vector or to carry out some
#	procedure a specific number of times, you can do that with a for loop.
#	Let's sum the square of all the numbers between 1 and 10

for.sum <- 0;
for(i in 1:10){
	for.sum = for.sum + i^2
}
for.sum


#	--If-then-else statements--
#	When you meet a fork in the road, take it.  Well, ok, maybe not, but in R
#	you can use logic statements to make decisions.  For example, let's create
#	a vector of factors to indicate if a sequence is short or long:

lengthCat <- numeric(nrow(seq.sum))

for ( i in 1:length(seq.sum$nbases) ){

  if ( seq.sum$nbases[i] < 150 ){
    lengthCat[i] <- "short"
  }else if ( seq.sum$nbases[i] < 200 ){
    lengthCat[i] <- "medium"
  }else {
    lengthCat[i] <- "long"
  }
}

lengthCat <- factor(lengthCat)


#===============================================================================
#
#	Graphics
#
#===============================================================================
#
#	R has an amazing array of options for graphics.  When someone gives a talk
#	you can tell that they used Excel because the plots look uniformly awful.  
#	In contrast you can tell that someone used R because they look awesome.  We 
#	will just scratch the surface of its graphics capabilities here.  For more 
#	details about these functions remember to use ?<function name>.  For more
#	information about the more complicated and sophisticated graphics libraries
#	consult:
#
#		Murrell, P. (2005) _R Graphics_. Chapman & Hall/CRC Press.
#
#
#	--Histogram--
#	Let's look at the distribution of sequence lengths in our seq.sum table

hist(seq.sum$nbases)

#	Let's gussy it up a bit...

hist(seq.sum$nbases, col="skyblue", freq=T, xlab="Sequence Length",
	main="Distribution of Sequence Lengths")
box()

#	The hist command actually returns a value other than the graph...

histData <- hist(seq.sum$nbases, col="skyblue", freq=T, xlab="Sequence Length",
	main="Distribution of Sequence Lengths")
histData


#	--Stripcharts--
#	We can also create a strip chart of the sequence lengths for each 
#	homopolymer class...

stripchart(seq.sum$nbases~seq.sum$polymer, ylab="Sequence Length", 
	xlab="Homopolymer Length")

stripchart(seq.sum$nbases~seq.sum$polymer, method="jitter", 
	ylab="Sequence Length", xlab="Homopolymer Length")

stripchart(seq.sum$nbases~seq.sum$polymer, method="jitter", vertical=T, 
	ylab="Sequence Length", xlab="Homopolymer Length")

stripchart(seq.sum$nbases~seq.sum$polymer, method="jitter", vertical=T,
	jitter=0.4, pch=1, cex=.2, col=rainbow(5), ylab="Sequence Length", 
	xlab="Homopolymer Length")


#	--Boxplot--
#	Those strip charts had a lot of data and may be too difficult to interpret.
#	Instead, let's try to represent the data as box plots

boxplot(seq.sum$nbases~seq.sum$polymer, ylab="Sequence Length", 
	xlab="Homopolymer Length")



#	--Pie charts--
#	Pie charts are a generally bad way to represent data.  For a good chuckle
#	check out the "Note" section of...

?pie

polymerCount <- aggregate(seq.sum$polymer, list(polymer), length)
polymerFreq <- polymerCount$x / length(seq.sum$polymer)
pie(polymerFreq)


#	--Bar plots--

barplot(polymerCount$x, names.arg=polymerCount$Group, xlab="Homopolymer length",
	ylab="Number of Sequences")
barplot(polymerCount$x, names.arg=polymerCount$Group, ylab="Homopolymer length",
	xlab="Number of Sequences", horiz=T)



shared<-read.table(file="stool.final.tx.shared", header=T)
rownames(shared)<-shared$V2
shared<-as.matrix(shared[,-c(1,2,3)])
colnames(shared)<-paste("Phylotype", 1:ncol(shared), sep="")
shared.ra<-shared/apply(shared, 1, sum) # calculate the relative abundance of 
										# each phylotype

bar.col<-c(rep("pink", 12), rep("blue", 12))

barplot(shared.ra[,1:5], col=bar.col)
barplot(shared.ra[,1:5], beside=T, col=bar.col)
barplot(t(shared.ra), col=rainbow(200))



#	--Dot/Line graphs--
#	We might be interested in plotting the first two dimensions of a PCoA or 
#	NMDS plot.  Let's do this with data generated in the Costello stool analysis
#	tutorial.  The necessary file is in your folder.

nmds<-read.table(file="stool.final.an.thetayc.0.03.lt.nmds.axes", header=T)
plot(nmds$axis1, nmds$axis2)

#	or

plot(nmds$axis2~nmds$axis1)

#	Looking at the group names in the nmds table we see that the first 12 sample
#	names are from women ("F") and the last 12 are from men ("M").  There are
#	more elegant ways to do this, but for beginners, this will work...

nmds.col<-c(rep("pink", 12), rep("blue", 12))

plot(nmds$axis2~nmds$axis1, col=nmds.col, xlab="Axis 1", ylab="Axis 2")
legend(x=0.3, y=0.6, legend=c("Female", "Male"), pch=1, col=c("pink", "blue"))

plot(nmds$axis2~nmds$axis1, col=nmds.col, xlab="Axis 1", ylab="Axis 2", pch=18,
	cex=2)
legend(x=0.3, y=0.6, legend=c("Female", "Male"), pch=18, cex=1, col=c("pink", 
	"blue"))

#	Although these points aren't linked you could connect them...

plot(nmds$axis2~nmds$axis1, col=nmds.col, xlab="Axis 1", ylab="Axis 2", pch=18, 
	cex=2, type="b")
legend(x=0.3, y=0.6, legend=c("Female", "Male"), pch=18, cex=1, lty=1, 
	col=c("pink", "blue"))

#	You can also overlay two graphs on top of each other using the points
#	command.  Here we'll put the cumulative number of sequences that have that 
#	sequence length or higher.

hist(seq.sum$nbases, col="skyblue", freq=T, xlab="Sequence Length",
	main="Distribution of Sequence Lengths", ylim=c(0,length(seq.sum$nbases)))
points(sort(seq.sum$nbases), length(seq.sum$nbases):1, type="l")
box()


#	--Composite graphics--
#	You can make graphics in R using multiple panes.  Let's say we wanted to 
#	create a figure that had two panes representing histograpms for sequences
#	a maximum homopolymer length of 5 and one for the 6's

par(mfrow=c(2,1))
hist(seq.sum[seq.sum$polymer==5,]$nbases, col="skyblue", freq=T, 
	xlab="Length of sequences with a maximum polymer length of 5", main="", 
	xlim=c(0,300), breaks=seq(0,260, 20))
hist(seq.sum[seq.sum$polymer==6,]$nbases, col="red", freq=T,
	xlab="Length of sequences with a maximum polymer length of 6", main="", 
	xlim=c(0,300), breaks=seq(0,260, 20))
par(mfrow=c(1,1))

#===============================================================================
#
#	Libraries
#
#===============================================================================
#
#	One of the amazing aspects of R is that there is a broad and diverse array
#	of scientists engaged in adding features to the program by contributing
#	packages.  You can often find packages you might be interested in by 
#	googling "r <insert type of function>" that you want.  For example, if you'd
#	like to visualize PCoA or NMDS data in 3-D, you will likely stumble up on 
#	the rgl package.  You will need to install and load the package to use the 
#	plot3d command with your data

install.packages("rgl")
library(rgl)

pcoa<-read.table(file="stool.final.an.thetayc.0.03.lt.pcoa.axes", header=T)
rownames(pcoa)<-pcoa$V1
pcoa<-pcoa[,-1]
plot3d(pcoa, col=nmds.col, type="s")

#	Although the functionality is already in mothur, other packages that may
#	interest you include vegan and ecodist

#===============================================================================
#
#	Statistics
#
#===============================================================================
#
#	The R package has a wide array of statistical tools built in and created by
#	other developers.  If you can imagine the test, it is probably already
#	implemented in R

mf <- c(rep("F", 12), rep("M", 12))
t.test(shared.ra[,"Phylotype1"]~mf)
t.test(shared.ra[,"Phylotype5"]~mf)

wilcox.test(shared.ra[,"Phylotype1"]~mf)
wilcox.test(shared.ra[,"Phylotype5"]~mf)

#	Check back for more later.  This section is under development

#===============================================================================
#
#	Lists and data frames
#
#===============================================================================

#	Check back for more later.  This section is under development

#===============================================================================

#===============================================================================
#
#	More resources
#
#===============================================================================
#
#	Websites:
#		RTFM
#		http://www.r-project.org/
#
#		R tips
#		http://pj.freefaculty.org/R/Rtips.html
#	
#		R Graph Gallery
#		http://addictedtor.free.fr/graphiques/RGraphGallery.php
#
#		Quick-R
#		A website for experienced users of popular statistical packages such as
#		SAS, SPSS, Stata, and Systat (although current R users should also find
#		it useful).
#		http://www.statmethods.net/index.html
#
#	Books:
#		Dalgard, P (2008) _Introductory Statistics with R_.  Second edition.  
#		Springer.
#
#		Adler, J (2009) _R in a Nutshell_.  O'Reilly.
#
#		Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) _The New S 
#		Language_.  Wadsworth & Brooks/Cole.
#
#		Venables, W. N. and Ripley, B. D. (2002) _Modern Applied Statistics with
#		 S._  Fourth edition.  Springer.
#
#		Murrell, P. (2005) _R Graphics_. Chapman & Hall/CRC Press.
#
#===============================================================================