-
Notifications
You must be signed in to change notification settings - Fork 1
/
mothur-data-R-tutorial.R
1045 lines (808 loc) · 34.7 KB
/
mothur-data-R-tutorial.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
library(tidyverse)
library(ggplot2)
metadata <- read_excel(path="raw_data/baxter.metadata.xlsx",
col_types=c(sample = "text", fit_result = "numeric", Site = "text", Dx_Bin = "text",
dx = "text", Hx_Prev = "logical", Hx_of_Polyps = "logical", Age = "numeric",
Gender = "text", Smoke = "logical", Diabetic = "logical", Hx_Fam_CRC = "logical",
Height = "numeric", Weight = "numeric", NSAID = "logical", Diabetes_Med = "logical",
stage = "text")
)
metadata[["Height"]] <- na_if(metadata[["Height"]], 0)
metadata[["Weight"]] <- na_if(metadata[["Weight"]], 0)
metadata[["Site"]] <- recode(.x=metadata[["Site"]], "U of Michigan"="U Michigan")
metadata[["Dx_Bin"]] <- recode(.x=metadata[["Dx_Bin"]], "Cancer."="Cancer")
metadata[["Gender"]] <- recode(.x=metadata[["Gender"]], "m"="male")
metadata[["Gender"]] <- recode(.x=metadata[["Gender"]], "f"="female")
metadata <- rename_all(.tbl=metadata, .funs=tolower)
metadata <- rename(.data=metadata,
previous_history=hx_prev,
history_of_polyps=hx_of_polyps,
family_history_of_crc=hx_fam_crc,
diagnosis_bin=dx_bin,
diagnosis=dx,
sex=gender)
rarefy <- read.csv('stability.csv',header = T) %>%
select(-contains("lci-")) %>%
select(-contains("hci-")) %>%
gather(-numSampled,key=sample,value=sobs) %>%
mutate(sample=str_replace_all(sample,pattern = "0.03-",replacement = "")) %>%
drop_na()
rarefy
metadata <- get.metadata()
metadata_rarefy <- inner_join(metadata, rarefy)
ggplot(data=rarefy,aes(x=numSampled,y=sobs,group=sample,color=numSampled))+
geom_line()
#按诊断分组着色
gplot(metadata_rarefy, aes(x=numsampled, y=sobs, group=sample, color=diagnosis)) +
geom_line()
ggplot(metadata_rarefy, aes(x=numsampled, y=sobs, group=sample, color=diagnosis)) +
geom_line() +
scale_color_manual(name=NULL,
values=c("blue", "red", "black"),
breaks=c("normal", "adenoma", "cancer"),
labels=c("Normal", "Adenoma", "Cancer")) +
labs(title="Rarefaction curves are pretty pointless at this scale",
x="Number of Sequences Sampled per Subject",
y="Number of OTUs per Subject") +
theme_classic()
#设定坐标范围
ggplot(metadata_rarefy, aes(x=numsampled, y=sobs, group=sample, color=diagnosis)) +
geom_line() +
scale_color_manual(name=NULL,
values=c("blue", "red", "black"),
breaks=c("normal", "adenoma", "cancer"),
labels=c("Normal", "Adenoma", "Cancer")) +
coord_cartesian(xlim=c(0,20000), ylim=c(0,500)) +
labs(title="Rarefaction curves are pretty pointless at this scale",
x="Number of Sequences Sampled per Subject",
y="Number of OTUs per Subject") +
theme_classic()
#减少示例
set.seed(1) #this makes sure that we all get the same result!
metadata_rarefy_sample <- metadata %>%
sample_n(10) %>%
inner_join(., rarefy)
ggplot(metadata_rarefy_sample, aes(x=numsampled, y=sobs, group=sample, color=diagnosis)) +
geom_line() +
scale_color_manual(name=NULL,
values=c("blue", "red", "black"),
breaks=c("normal", "adenoma", "cancer"),
labels=c("Normal", "Adenoma", "Cancer")) +
coord_cartesian(xlim=c(0,20000), ylim=c(0,500)) +
labs(title="Rarefaction curves are pretty pointless at this scale",
x="Number of Sequences Sampled per Subject",
y="Number of OTUs per Subject") +
theme_classic()
#改变线型
ggplot(metadata_rarefy_sample, aes(x=numsampled, y=sobs, group=sample, color=diagnosis)) +
geom_line(linetype=5) +
scale_color_manual(name=NULL,
values=c("blue", "red", "black"),
breaks=c("normal", "adenoma", "cancer"),
labels=c("Normal", "Adenoma", "Cancer")) +
coord_cartesian(xlim=c(0,20000), ylim=c(0,500)) +
labs(title="Rarefaction curves are pretty pointless at this scale",
x="Number of Sequences Sampled per Subject",
y="Number of OTUs per Subject") +
theme_classic()
#添加竖线
ggplot(metadata_rarefy_sample, aes(x=numsampled, y=sobs, group=sample, color=diagnosis)) +
geom_hline(yintercept=200, color="gray", size=2) +
geom_line() +
scale_color_manual(name=NULL,
values=c("blue", "red", "black"),
breaks=c("normal", "adenoma", "cancer"),
labels=c("Normal", "Adenoma", "Cancer")) +
coord_cartesian(xlim=c(0,20000), ylim=c(0,500)) +
labs(title="Rarefaction curves are pretty pointless at this scale",
x="Number of Sequences Sampled per Subject",
y="Number of OTUs per Subject") +
theme_classic()
#===============================================================================
#
# Introduction to R for use with output from mothur and other tools commonly
# used in computational microbial ecology
#
# Pat Schloss
#
#
# These notes are inspired and loosely built upon a tutorial developed by
# Pawel Gajer ([email protected]) at the University of Maryland Institute for
# Genomic Sciences
#
#
# The purpose of this tutorial is to introduce microbial ecologsist to the R
# statistical programming enviroment. You should create a folder that will
# hold the data files that we will use and creat in the tutorial. I will be
# assuming that you either have never used R or used it in a rather basic
# manner.
#
# At the end you will find a list of resources that you may find useful for
# learning more about R
#
# This tutorial can be run by entering the following command within R...
#
# source(file="tutorial.R", echo=T)
#
#===============================================================================
#
# Introduction to the R environment
#
#===============================================================================
#
# The R environment is a command line where you enter commands. When you
# fire up R, you will see the following:
#
# R version 2.10.0 (2009-10-26)
# Copyright (C) 2009 The R Foundation for Statistical Computing
# ISBN 3-900051-07-0
#
# R is free software and comes with ABSOLUTELY NO WARRANTY.
# You are welcome to redistribute it under certain conditions.
# Type 'license()' or 'licence()' for distribution details.
#
# Natural language support but running in an English locale
#
# R is a collaborative project with many contributors.
# Type 'contributors()' for more information and
# 'citation()' on how to cite R or R packages in publications.
#
# Type 'demo()' for some demos, 'help()' for on-line help, or
# 'help.start()' for an HTML browser interface to help.
# Type 'q()' to quit R.
#
# >
#
# The ">" is the prompt where you enter the commands. I will not be putting
# the prompt in the tutorial. Let's get started by entering the following
2+2
# You will see the following:
#
# > 2+2
# [1] 4
#
# This output tells you the obvious result that 2+2 is 4. R can be used as an
# over-grown calculator, but it is so much more than that. It's really a high
# level programming language to give you publication worthy graphics and
# perform complex statistical analyses.
#
# If you want to quit R, run the following:
#q()
# You will encounter the follwing prompt:
#
# Save workspace image? [y/n/c]:
#
# If you select "y" then the next time you run R, it will recall your previous
# commands.
#y
# Fire up R again and hit the up arrow twice. You will see the following:
#
# > 2+2;
#
# As a short cut you can also type the following to get the same effect when
# quitting R:
#q("yes")
# Something to note as you go through the tutorial is the use of the pound (#)
# symbol. This indicates that what follows is a comment. For example:
2+2 #this is a comment
# Another useful feature is the inline help feature:
?mean
# This will output a help file for the 'mean' function. You can scroll
# through it with the arrow keys as well as the space bar. At the end of the
# help text are one or more examples of how to use the function. You can
# execute this example by copying and pasting the text. Alternatively, you
# can use the 'example' command:
example(plot)
# Not sure what the function is that you want? Use the '??' function...
??pca
# This will tell you the list of functions and packages that have some
# reference to "pca".
#
# Running "2+2" gets you the answer of "4". But what if you want to store
# that value? Save it as a variable...
x <- 2+2
# or
x = 2+2
x
# The left arrow is the preferred method of assigning values to variables.
# Now you can manipulate that variable:
x + 2
# You can run two commands on the same line by separating the commands with a
# semicolon:
x<-2+2;x+2
# To see what variables have been declared:
ls()
# To remove variables you no longer need :
rm(x)
#x
# Combining the last two commands will remove all of the variables stored in
# memory:
rm(list=ls())
ls()
# To keep our examples relevant, let's quickly discuss how to read in a table.
# The frist two commands below are not necessarily needed if you know you are
# in the same folder as your data
getwd() # is this the directory we want to be in?
setwd("~/Desktop/IntroToR") # if not let's change directories
seq.sum <- read.table(file="stool.trim.fasta.summary", header=T)
# The read.table command will read in your data table and keep the column
# headings. These data represent the length of sequece in the Costello stool
# dataset after removing primers, barcodes, and low quality bases. To make
# the data easier to manipulate you can do the following:
colnames(seq.sum) # see the column heading names
seq.sum$nbasesp[1:10] # get the column "nbases" out of the table
attach(seq.sum) # "attach" the table so that you don't need the "$"
nbases[1:10]
# Don't worry if you don't understand everything in the last several lines.
# The important thing is to see that there are different ways to get the same
# output
#
# Ok, that's enough preliminaries for now. Let's get going...
#===============================================================================
#
# Variables
#
#===============================================================================
#
# Technically speaking everything in R is an object. That may be a bit into
# the weeds for most people. But try to keep up. There are many types of
# objects in R. We'll start with these:
# * Numeric values
# * Character values
# * Logical values
# * Vectors
# * Matrices
#
#
# --Numeric values--
# The variable we used above, x, is a numeric value. Here are some other
# examples:
x <- 2/3; x
y <- 31.7
z <- 2.1e7 # same as 2.1*10^7
longAndSilly.variable.name <- pi/3 # an example of a long variable name
# Although it is a matter of personal style, ideally, your variable names
# will have some meaning to you. For example:
genomeSize <- 4.5e6
#16SCopyNumber <- 6 # illegal variable name
rrnCopyNumber <- 6
# A wide variety of arithmetic functions can be applied to these types of
# variables:
log(x) # natural logarithm
log2(x) # base 2 logarithm
log10(x) # base 10 logarithm
exp(x) # exponent
sqrt(x) # square root
abs(x) # absolute value
floor(x) # the largest integers not greater than x
ceiling(x) # the smallest integers not less than x
x %% 2 # remainder or modulo operator
# --Character values--
# Strings of character values are useful for storing text information
A <- "Alanine"; A
R <- "Argenine"; R
N <- "Asparagine"; N
# The paste function is useful for converting numerical information into a
# string
x<-3.14159
text<-paste("x=", x, " y=", y, sep="")
text
# or
text<-paste("x=", format(x, digits=3), " y=", y, sep="")
text
# --Logical values--
# Logical variables can have one of two values - TRUE or FALSE
x <- TRUE
y <- FALSE
!x # NOT operator
x && y # AND operator
x & y # bitwise AND operator (mainly for vectors)
x || y # OR operator
x | y # bitwise OR operator (mainly for vectors)
x == y # is equal operator
x != y # is not equal operator
# We can also perform logical operators on numerical variables:
x <- 5
y <- 3
x > y # greater than operator
x >= y # greater than or equal to operator
x < y # less than operator
x <= y # less than or equal tooperator
x == y # is equal to operator
x != y # is not equal to operator
# We can also convert numerical values to logical valeus
x <- 0
as.logical(x)
as.logical(y)
# We can also use logical operators on strings
x <- "ATG"
y <- "CCC"
x > y # greater than operator
x >= y # greater than or equal to operator
x < y # less than operator
x <= y # less than or equal tooperator
x == y # is equal to operator
x != y # is not equal to operator
# --Vectors--
# Vectors are a one-dimensional set of values of the same type. You can read
# in vectors from a file or create them on the fly. Here are several
# examples:
19:55 # list the values from 19 to 55 by ones
c(1,2,3,4) # concatenate 1, 2, 3, 4, 5 into a vector
rep("red", 5) # repeat "red" five times
seq(1,10,by=3) # list the values from 1 to 10 by 3's
seq(1,10,length.out=20) # list 20 evenly spaced elements from 1 to 10
seq(1,10,len=20) # same thing; arguments of any function can be
# abbreviated as long as the abbreviation is unique
# These can be combined as well
c(rep("red", 5), rep("white", 5), rep("blue", 5))
rep(c(0,1), 10)
# And they can be assigned to variables
x <- seq(1,100,by=0.5)
x
code <- c("A", "T", "G", "C")
# Note that in contrast to many programming languages, vectors in R are
# indexed such that the first value is 1 NOT 0.
code[2]
code[0]
code[-1]
code[c(1,2)]
# Recall that we've already seen vectors in the preamble to this tutorial...
nbases
# You can easily determine the length of a vector
length(code)
code[length(code)]
length(nbases)
# Logical operators can also be used on vectors...
tf <- x > 50
isLong <- nbases > 200 # do you know what this is doing?
# And used to select portions of vectors
x[tf]
# One of the awesome things about vectors is that you can perform algebraic
# manipulations on them. These types of operations are "elementwise" meaning
# that the operation is applied to each operation
2 * x
x + x
log(x)
# To define a vector without giving it any values
z <- numeric(5) # This creates a numerical vector with 5 zeros
z[3] <- 10
z
z[1:3] <- 5
z
z[10] <- pi
t <- character(5)
t[4] <- "DNA rocks!"
# You can also create vectors that are indexed by character strings. In some
# programming languages these are called hash-maps or look-up tables.
v <- numeric(0)
v["A"] <- 1.23498
v["T"] <- 2.2342
v["C"] <- 3
v["G"] <- 4
v
# You can get the name of each cell in the vector:
names(v)
names(v) <- NULL # this removes names attribute
v
names(nbases) <- seqname
nbases[1:10]
# Alternatively, we could define v2 as
v2 <- c(A=1.23498,T=2.2342,C=3,G=4)
v2
v2 * 2
as.vector(v2) # strips out names
is.vector(v2) # checks if v is a vector
# We can access elements of 'v' by their labels
v2["A"]
v2[["A"]] # strips the name associated with value 1
# There are many ways to get a value out of a vector. We've already seen
# some of these
v <- floor(runif(10, 1,10))# create a vector with 10 values randomly drawn from
v # the range of 1 to 10
n <- 3
v[n] # n-th element
v[-n] # all but the n-th element
v[1:n] # first n elements
v[-(1:n)] # elements from n+1 to the end
v[c(2,4)] # 2-nd and 4-th elements
v["name"] # element named "name"
v[v > 6] # elements greater than 6
v[v > 4 & v < 6] # elements between 4 and 6
v[v %in% c(1,3,7)] # elements in a given set
v[!is.na(v)] # subsequence of v consisting of non-missing values of v
v[is.na(v)] <- 0 # sets all missing values to 0
# There are a variety of operators that take vectors as input
length(v) # length of vector v
mean(v) # mean of v
median(v) # median of v
sd(v) # standard deviation of v
var(v) # variance of v
mad(v) # median absolute deviation of v
min(v) # min of v
max(v) # maximal element of v
which.min(v) # returns the index of the smallest element of v
which.max(v) # returns the index of the greatest element of v
summary(v) # return descriptive statistics of v
sum(v) # sum of elements of v
prod(v) # product of all elements of v
sort(v) # ascending sort the values of v
sort(v, decreasing=T)# descending sort the values of v
order(v) # the order of the sorted values of v
v[order(v)] # gives the same order as sort(v)
rev(v) # reverse the order of v
unique(v) # give the unique values of v
all(v) # returns TRUE if all values of a logical vector v are TRUE,
# otherwise returns FALSE
any(v) # returns TRUE if at least one value of a logical vector v is
# TRUE, otherwise returns FALSE
# Exercises: Calculate the following on the data we read in from the seq.sum
# file and stored as the "nbases" vector...
#
# number of sequences
# number of sequences longer than 200 bp
# mean seqeunce length
# median sequence length
# mean sequence length for sequences longer than 200 bp
# stanard deviation of sequence length for sequences longer than 200 bp
# get seq.sum statisitics for the length of sequences
# Note that if a vector contains missing values then all above numerical
# routines are going to return NA value. In order to skip
# missing values of 'v' while computing some seq.sum statistics of 'v', one
# can set na.rm to TRUE as in the following example
v[11] = NA
mean(v)
mean(v, na.rm=T)
# Say you want to search a character vector for a feature of the strings. To
# do this you can use the grep function and its relatives to get the indices
# of values in the vector that match the search
shades.of.red <- grep("red", colors())
colors()[shades.of.red]
# --Tables & Matrices--
# Tables and matrices are multi-dimensional sets of values of multiple or the
# same type. We already read in a table in the preamble of this tutorial...
getwd() # is this the directory we want to be in?
setwd("~/Desktop/IntroToR") # if not let's change directories
seq.sum <- read.table(file="stool.trim.fasta.summary", header=T)
# Here "seq.sum" is a table of values describing some characteristics of the
# sequences after removing the barcodes, primers, and low quality base calls.
# Tables and matrices are indexed much like vectors...
seq.sum[1,1] # get the name of the first sequence in the table
seq.sum[1,c(1,4)] # get the name and length of the first sequence
seq.sum[1,] # get all of the data in the first row
seq.sum[,4] # get all of the data in the column labelled "nbases"
seq.sum[,"nbases"] # get all of the data in the column labelled "nbases"
seq.sum$nbases # get all of the data in the column labelled "nbases"
attach(seq.sum);nbases # get all of the data in the column labelled "nbases"
detach(seq.sum)
# It should be clear taht the last four commands above gave the same data.
# With the exception of the first command, each of these commands returns a
# vector.
#
# When reading in a table it is always nice to get a handle of what you're
# working with...
dim(seq.sum) # number of rows and columns in "seq.sum"
nrow(seq.sum) # number of rows "seq.sum"
ncol(seq.sum) # number of columns in "seq.sum"
colnames(seq.sum) # names of the columns in "seq.sum"
rownames(seq.sum) # names of the rows in "seq.sum"
seq.sum[1:10,] # get the first 10 rows of data from the "seq.sum" file
# Looking at the output from the 'rownames' command we know we can do better
# than those row names. Let's use the sequence names as the row names...
rownames(seq.sum)<-seq.sum$seqname
seq.sum<-seq.sum[,-1]
seq.sum[1:10,]
# Let's look at how we could get the row indices corresponding to sequences
# longer than 200 bp
(1:length(nbases))[nbases > 200]
# To get the sequence names we could then do...
rownames(seq.sum)[(1:length(nbases))[nbases > 200]] #wonky
rownames(seq.sum)[seq.sum$nbases > 200] #better
# Earlier we showed how to sort vectors. Let's sort seqeunce legnth by the
# length of the longest homopolymer in the sequence...
polymerOrder <- order(seq.sum$polymer)
seq.sum[polymerOrder,c("polymer", "nbases")]
# You can see that's not exactly what we were hoping for. We can actually
# use the order command to sort on multiple columns...
polymerLengthOrder <- order(seq.sum$polymer, seq.sum$nbases)
polymerLength <- seq.sum[polymerLengthOrder,c("polymer", "nbases")]
polymerLength
# Let's write the sorted lengths to a new file
write(polymerLength$nbases,file="lengths.txt")
# To read a vector in from a file...
lengths <- scan("lengths.txt")
# Those two commands are for writing and reading vectors. If we wanted to
# write and read a table we'd do the following...
write.table(polymerLength,file="polymerLengths.txt", quote=F)
# If you open polymerLengths.txt in a text editor you will see that there are
# three columns of data, but only two headings. If you have one fewer
# headings than columns, R will use the first column of data as the rownames
# when you try to read it back in...
table<-read.table(file="polymerLengths.txt", header=T)
# There are some functions that take matrices/tables as input
m <- matrix(floor(runif(100, 1, 10)), nrow=10, ncol=10) #create a 10 x 10 matrix
t(m) # transpose the matrix
1/m # take each value of m and find it's reciprocal
m * m # calculate the square of each value in m
m %*% m # performs matrix multiplication
crossprod(m,m) # performs the cross product
rowSums(m) # calculate the sum for each row
colSums(m) # calculate the sum for each column
lower.tri(m) # find the indices that are below the diagonal
m[lower.tri(m)] # give the lower triangle of m
diag(m) # the values on the diagonal of m
det(m) # the determinent of m
# If you try to get the mean or standard deviation of a row or column you'll
# struggle mightilly with these commands. Instead you need to use the "apply"
# command
mean(m)
apply(m, 1, mean) # get the mean for each row (that's the 1)
apply(m, 2, mean) # get the mean for each column (that's the 2)
apply(m, 1, sum) # get the sum for each row - same as rowSums(m)
apply(m, 2, sum) # get the sum for each column - same as colSums(m)
# --Factors--
# Factors are categorical variables. The "seq.sum" table doesn't exactly
# contain any categorical data. For the purposes of discussion, let's use
# the polymer column as a categorical data type. The important thing is that
# factors have discrete levels. For example, in microbial ecology, we might
# think of soil types, body sites, a person's sex, or whether a site is
# polluted as categorical variables.
# Factors can be created using factor() routine.
seq.sum$polymer <-factor(seq.sum$polymer)
levels(seq.sum$polymer)
# If we wanted to convert our factors from factors to strings or to numbers
# we could do the following...
#
# seq.sum$polymer <- as.character(seq.sum$polymer)
# seq.sum$polymer <- as.numeric(seq.sum$polymer)
#
#
# We might be interested to see if sequence length varies with the length
# of the homopolymer in the sequence. We can do this with the aggregate
# command and treating polymer as a factor...
aggregate(seq.sum$nbases, list(polymer), mean)
aggregate(seq.sum$nbases, list(polymer), median)
aggregate(seq.sum$nbases, list(polymer), sd)
# Similar to the aggregate command, the "by" command will allow you to take
# all of the columns and perform an operation on the...
by(seq.sum, seq.sum["polymer"], summary)
#===============================================================================
#
# Programming basics
#
#===============================================================================
#
# R is a pretty high-level programming language with extensive functionality
# built in. The strength of R is that as a user you can add to this to suit
# your needs
#
# --Customized functions--
# Want to make your own function or package? It's relatively simple. The
# general syntax is as follows
#
# functionName <- function(x){
# #the stuff goes here
# }
#
# So we might write a trivial script to calculate the square root of a value.
my.sqrt<-function(x){
sqrt(x)
}
my.values <- 1:10
my.sqrt(my.values)
# The output of "my.sqrt(my.values)" should be the same as the following:
sqrt(1:10)
# --for loops--
# If you want to do something to each value in a vector or to carry out some
# procedure a specific number of times, you can do that with a for loop.
# Let's sum the square of all the numbers between 1 and 10
for.sum <- 0;
for(i in 1:10){
for.sum = for.sum + i^2
}
for.sum
# --If-then-else statements--
# When you meet a fork in the road, take it. Well, ok, maybe not, but in R
# you can use logic statements to make decisions. For example, let's create
# a vector of factors to indicate if a sequence is short or long:
lengthCat <- numeric(nrow(seq.sum))
for ( i in 1:length(seq.sum$nbases) ){
if ( seq.sum$nbases[i] < 150 ){
lengthCat[i] <- "short"
}else if ( seq.sum$nbases[i] < 200 ){
lengthCat[i] <- "medium"
}else {
lengthCat[i] <- "long"
}
}
lengthCat <- factor(lengthCat)
#===============================================================================
#
# Graphics
#
#===============================================================================
#
# R has an amazing array of options for graphics. When someone gives a talk
# you can tell that they used Excel because the plots look uniformly awful.
# In contrast you can tell that someone used R because they look awesome. We
# will just scratch the surface of its graphics capabilities here. For more
# details about these functions remember to use ?<function name>. For more
# information about the more complicated and sophisticated graphics libraries
# consult:
#
# Murrell, P. (2005) _R Graphics_. Chapman & Hall/CRC Press.
#
#
# --Histogram--
# Let's look at the distribution of sequence lengths in our seq.sum table
hist(seq.sum$nbases)
# Let's gussy it up a bit...
hist(seq.sum$nbases, col="skyblue", freq=T, xlab="Sequence Length",
main="Distribution of Sequence Lengths")
box()
# The hist command actually returns a value other than the graph...
histData <- hist(seq.sum$nbases, col="skyblue", freq=T, xlab="Sequence Length",
main="Distribution of Sequence Lengths")
histData
# --Stripcharts--
# We can also create a strip chart of the sequence lengths for each
# homopolymer class...
stripchart(seq.sum$nbases~seq.sum$polymer, ylab="Sequence Length",
xlab="Homopolymer Length")
stripchart(seq.sum$nbases~seq.sum$polymer, method="jitter",
ylab="Sequence Length", xlab="Homopolymer Length")
stripchart(seq.sum$nbases~seq.sum$polymer, method="jitter", vertical=T,
ylab="Sequence Length", xlab="Homopolymer Length")
stripchart(seq.sum$nbases~seq.sum$polymer, method="jitter", vertical=T,
jitter=0.4, pch=1, cex=.2, col=rainbow(5), ylab="Sequence Length",
xlab="Homopolymer Length")
# --Boxplot--
# Those strip charts had a lot of data and may be too difficult to interpret.
# Instead, let's try to represent the data as box plots
boxplot(seq.sum$nbases~seq.sum$polymer, ylab="Sequence Length",
xlab="Homopolymer Length")
# --Pie charts--
# Pie charts are a generally bad way to represent data. For a good chuckle
# check out the "Note" section of...
?pie
polymerCount <- aggregate(seq.sum$polymer, list(polymer), length)
polymerFreq <- polymerCount$x / length(seq.sum$polymer)
pie(polymerFreq)
# --Bar plots--
barplot(polymerCount$x, names.arg=polymerCount$Group, xlab="Homopolymer length",
ylab="Number of Sequences")
barplot(polymerCount$x, names.arg=polymerCount$Group, ylab="Homopolymer length",
xlab="Number of Sequences", horiz=T)
shared<-read.table(file="stool.final.tx.shared", header=T)
rownames(shared)<-shared$V2
shared<-as.matrix(shared[,-c(1,2,3)])
colnames(shared)<-paste("Phylotype", 1:ncol(shared), sep="")
shared.ra<-shared/apply(shared, 1, sum) # calculate the relative abundance of
# each phylotype
bar.col<-c(rep("pink", 12), rep("blue", 12))
barplot(shared.ra[,1:5], col=bar.col)
barplot(shared.ra[,1:5], beside=T, col=bar.col)
barplot(t(shared.ra), col=rainbow(200))
# --Dot/Line graphs--
# We might be interested in plotting the first two dimensions of a PCoA or
# NMDS plot. Let's do this with data generated in the Costello stool analysis
# tutorial. The necessary file is in your folder.
nmds<-read.table(file="stool.final.an.thetayc.0.03.lt.nmds.axes", header=T)
plot(nmds$axis1, nmds$axis2)
# or
plot(nmds$axis2~nmds$axis1)
# Looking at the group names in the nmds table we see that the first 12 sample
# names are from women ("F") and the last 12 are from men ("M"). There are
# more elegant ways to do this, but for beginners, this will work...
nmds.col<-c(rep("pink", 12), rep("blue", 12))
plot(nmds$axis2~nmds$axis1, col=nmds.col, xlab="Axis 1", ylab="Axis 2")
legend(x=0.3, y=0.6, legend=c("Female", "Male"), pch=1, col=c("pink", "blue"))
plot(nmds$axis2~nmds$axis1, col=nmds.col, xlab="Axis 1", ylab="Axis 2", pch=18,
cex=2)
legend(x=0.3, y=0.6, legend=c("Female", "Male"), pch=18, cex=1, col=c("pink",
"blue"))
# Although these points aren't linked you could connect them...
plot(nmds$axis2~nmds$axis1, col=nmds.col, xlab="Axis 1", ylab="Axis 2", pch=18,
cex=2, type="b")
legend(x=0.3, y=0.6, legend=c("Female", "Male"), pch=18, cex=1, lty=1,
col=c("pink", "blue"))
# You can also overlay two graphs on top of each other using the points
# command. Here we'll put the cumulative number of sequences that have that
# sequence length or higher.
hist(seq.sum$nbases, col="skyblue", freq=T, xlab="Sequence Length",
main="Distribution of Sequence Lengths", ylim=c(0,length(seq.sum$nbases)))
points(sort(seq.sum$nbases), length(seq.sum$nbases):1, type="l")
box()
# --Composite graphics--
# You can make graphics in R using multiple panes. Let's say we wanted to
# create a figure that had two panes representing histograpms for sequences
# a maximum homopolymer length of 5 and one for the 6's
par(mfrow=c(2,1))
hist(seq.sum[seq.sum$polymer==5,]$nbases, col="skyblue", freq=T,
xlab="Length of sequences with a maximum polymer length of 5", main="",
xlim=c(0,300), breaks=seq(0,260, 20))
hist(seq.sum[seq.sum$polymer==6,]$nbases, col="red", freq=T,
xlab="Length of sequences with a maximum polymer length of 6", main="",
xlim=c(0,300), breaks=seq(0,260, 20))
par(mfrow=c(1,1))
#===============================================================================
#
# Libraries
#
#===============================================================================
#
# One of the amazing aspects of R is that there is a broad and diverse array
# of scientists engaged in adding features to the program by contributing
# packages. You can often find packages you might be interested in by
# googling "r <insert type of function>" that you want. For example, if you'd
# like to visualize PCoA or NMDS data in 3-D, you will likely stumble up on
# the rgl package. You will need to install and load the package to use the
# plot3d command with your data
install.packages("rgl")
library(rgl)
pcoa<-read.table(file="stool.final.an.thetayc.0.03.lt.pcoa.axes", header=T)
rownames(pcoa)<-pcoa$V1
pcoa<-pcoa[,-1]
plot3d(pcoa, col=nmds.col, type="s")
# Although the functionality is already in mothur, other packages that may
# interest you include vegan and ecodist
#===============================================================================
#
# Statistics
#
#===============================================================================
#
# The R package has a wide array of statistical tools built in and created by
# other developers. If you can imagine the test, it is probably already
# implemented in R
mf <- c(rep("F", 12), rep("M", 12))
t.test(shared.ra[,"Phylotype1"]~mf)
t.test(shared.ra[,"Phylotype5"]~mf)
wilcox.test(shared.ra[,"Phylotype1"]~mf)
wilcox.test(shared.ra[,"Phylotype5"]~mf)
# Check back for more later. This section is under development
#===============================================================================
#