-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathindex.html
1255 lines (1211 loc) · 81.1 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
<meta charset="utf-8" />
<meta name="generator" content="quarto-1.4.549" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
<meta name="author" content="Alex Sanchez-Pla" />
<meta name="dcterms.date" content="2024-03-28" />
<title>Bioconductor classes for working with microarrays or similar data</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
width: 0.8em;
margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
vertical-align: middle;
}
/* CSS for syntax highlighting */
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
}
pre.numberSource { margin-left: 3em; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
</style>
<!-- htmldependencies:E3FAD763 -->
</head>
<body>
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
<div id="quarto-toc-target"></div>
</div>
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">Bioconductor classes for working <br> with microarrays or similar data</h1>
</div>
<div class="quarto-title-meta">
<div>
<div class="quarto-title-meta-heading">Author</div>
<div class="quarto-title-meta-contents">
<p>Alex Sanchez-Pla </p>
</div>
</div>
<div>
<div class="quarto-title-meta-heading">Published</div>
<div class="quarto-title-meta-contents">
<p class="date">March 28, 2024</p>
</div>
</div>
</div>
</header>
<nav id="TOC" role="doc-toc">
<h2 id="toc-title">Table of contents</h2>
<ul>
<li><a href="#introduction" id="toc-introduction"><span class="header-section-number">1</span> Introduction</a>
<ul>
<li><a href="#availability" id="toc-availability"><span class="header-section-number">1.1</span> Availability</a></li>
</ul></li>
<li><a href="#bioconductor-classes-to-manage-micrarray-and-similar-data" id="toc-bioconductor-classes-to-manage-micrarray-and-similar-data"><span class="header-section-number">2</span> Bioconductor classes to manage microarray and similar data</a>
<ul>
<li><a href="#the-oop-paradigm" id="toc-the-oop-paradigm"><span class="header-section-number">2.1</span> The OOP paradigm</a></li>
<li><a href="#bioconductor-classes" id="toc-bioconductor-classes"><span class="header-section-number">2.2</span> Bioconductor Classes</a></li>
<li><a href="#the-biobase-package" id="toc-the-biobase-package"><span class="header-section-number">2.3</span> The <code>Biobase</code> package</a></li>
<li><a href="#a-toy-dataset" id="toc-a-toy-dataset"><span class="header-section-number">2.4</span> A toy dataset</a></li>
<li><a href="#creating-and-using-objects-of-class-expressionset" id="toc-creating-and-using-objects-of-class-expressionset"><span class="header-section-number">2.5</span> Creating and using objects of class ExpressionSet</a>
<ul>
<li><a href="#slot-assaydata" id="toc-slot-assaydata"><span class="header-section-number">2.5.1</span> Slot <tt>AssayData</tt></a></li>
<li><a href="#information-about-covariates" id="toc-information-about-covariates"><span class="header-section-number">2.5.2</span> Information about covariates</a></li>
<li><a href="#adding-information-about-features" id="toc-adding-information-about-features"><span class="header-section-number">2.5.3</span> Adding information about features</a></li>
<li><a href="#storing-information-about-the-experiment" id="toc-storing-information-about-the-experiment"><span class="header-section-number">2.5.4</span> Storing information about the experiment</a></li>
</ul></li>
<li><a href="#using-objects-of-class-expressionset" id="toc-using-objects-of-class-expressionset"><span class="header-section-number">2.6</span> Using objects of class <tt>ExpressionSet</tt></a>
<ul>
<li><a href="#accessing-slot-values" id="toc-accessing-slot-values"><span class="header-section-number">2.6.1</span> Accessing Slot values</a></li>
<li><a href="#subsetting-expressionsets" id="toc-subsetting-expressionsets"><span class="header-section-number">2.6.2</span> Subsetting <code>ExpressionSets</code></a></li>
</ul></li>
<li><a href="#exercises" id="toc-exercises"><span class="header-section-number">2.7</span> Exercises</a></li>
</ul></li>
<li><a href="#the-geoquery-package-to-download-data-from-geo" id="toc-the-geoquery-package-to-download-data-from-geo"><span class="header-section-number">3</span> The <code>GEOquery</code> package to download data from GEO</a>
<ul>
<li><a href="#downloading-a-dataset-in-gse-format" id="toc-downloading-a-dataset-in-gse-format"><span class="header-section-number">3.1</span> Downloading a dataset in GSE format</a></li>
<li><a href="#downloading-a-dataset-in-gsd-format" id="toc-downloading-a-dataset-in-gsd-format"><span class="header-section-number">3.2</span> Downloading a dataset in GSD format</a></li>
</ul></li>
<li><a href="#exercises-1" id="toc-exercises-1"><span class="header-section-number">4</span> Exercises</a></li>
<li><a href="#references" id="toc-references"><span class="header-section-number">5</span> References</a></li>
<li><a href="#additional-info" id="toc-additional-info"><span class="header-section-number">6</span> Additional info</a></li>
</ul>
</nav>
<section id="introduction" class="level1" data-number="1">
<h1 data-number="1"><span class="header-section-number">1</span> Introduction</h1>
<p>Many omics data, once they have been pre-processed, can be stored as numeric data that can be represented as the typical “data matrix”. This matrix is, however, usually transposed, that is genes (variables) are in rows and samples (individuals) are in columns.</p>
<p>A person who is familiar with statistics and R can therefore explore an omics dataset using standard univariate and multivariate statistical methods.</p>
<p>In practice, omics datasets have more information than just what can be stored in a table. This can be annotation data, multiple covariates other than what is in the column names, or information about the experimental design or simply the experiment.</p>
<p>Even for a person who is proficient with software, simultaneously managing distinct objects that contain related information can be “tricky”, and there is always a danger that the distinct components lose synchronization. For instance, removing one sample from the expression matrix requires that the corresponding information is removed or updated in the covariates table. An error in doing so can cause a variety of problems.</p>
<p>In this lab we introduce the <tt>ExpressionSet</tt> class as an option for managing all these pieces of information simultaneously, which not only simplifies the process, but also prevents mistakes derived from lack of consistency between the parts.</p>
<p>The lab has two parts</p>
<ol type="1">
<li><p>Introduces Bioconductor classes to store and access microarray data.</p></li>
<li><p>Shows how to use the <code>GEOquery</code> Bioconductor package to download microarray data into an analysis-ready form.</p></li>
</ol>
<section id="availability" class="level2" data-number="1.1">
<h2 data-number="1.1"><span class="header-section-number">1.1</span> Availability</h2>
<p>This document can be re-created using the repository</p>
</section>
</section>
<section id="bioconductor-classes-to-manage-micrarray-and-similar-data" class="level1" data-number="2">
<h1 data-number="2"><span class="header-section-number">2</span> Bioconductor classes to manage microarray and similar data</h1>
<section id="the-oop-paradigm" class="level2" data-number="2.1">
<h2 data-number="2.1"><span class="header-section-number">2.1</span> The OOP paradigm</h2>
<p>Object-oriented design provides a convenient way to represent data structures and actions performed on them.</p>
<ul>
<li>A <em>class</em> can be thought of as a template, a description of what constitutes each instance of the class.</li>
<li>An <em>instance</em> of a class is a realization of what describes the class.</li>
<li>Attributes of a class are data components, and methods of a class are functions, or actions the instance/class is capable of.</li>
</ul>
<p>The R language has several implementations of the OO paradigm but, in spite of its success in other languages, its use in R is relatively uncommon.</p>
</section>
<section id="bioconductor-classes" class="level2" data-number="2.2">
<h2 data-number="2.2"><span class="header-section-number">2.2</span> Bioconductor Classes</h2>
<p>One case where OOP has succeeded in R or, at least, is more used than in others is in the Bioconductor Project (<a href="https://bioconductor.org">bioconductor.org</a>). In Bioconductor we have to deal with complex data structures such as the results of a microarray experiment, a genome and its annotation or a complex multi-omics dataset. These are situations where using OOP to create classes to manage those complex types of data is clearly appropriate.</p>
</section>
<section id="the-biobase-package" class="level2" data-number="2.3">
<h2 data-number="2.3"><span class="header-section-number">2.3</span> The <code>Biobase</code> package</h2>
<p>The <code>Biobase</code> package implements one of the best known Bioconductor classes: <tt>ExpressionSet</tt>. It was originally intended to contain microarray data and information on the study that generated them and it has become a standard for similar data structures.</p>
<div class="cell">
<div class="sourceCode" id="cb1"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(Biobase)</span></code></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>Loading required package: BiocGenerics</code></pre>
</div>
<div class="cell-output cell-output-stderr">
<pre><code>
Attaching package: 'BiocGenerics'</code></pre>
</div>
<div class="cell-output cell-output-stderr">
<pre><code>The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs</code></pre>
</div>
<div class="cell-output cell-output-stderr">
<pre><code>The following objects are masked from 'package:base':
anyDuplicated, aperm, append, as.data.frame, basename, cbind,
colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
table, tapply, union, unique, unsplit, which.max, which.min</code></pre>
</div>
<div class="cell-output cell-output-stderr">
<pre><code>Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.</code></pre>
</div>
</div>
<p>The figure below shows the structure of this class. It is essentially a <em>container</em> that has distinct slots to store some of the most usual components in an omics dataset.</p>
<div class="cell">
<div class="cell-output-display">
<div class="quarto-figure quarto-figure-center">
<figure>
<p><img src="images/Structure-of-Bioconductors-ExpressionSet-class.png" class="img-fluid" width="425" alt="Diagram of the structure of the ExpressionSet class and its slots" /></p>
<figcaption>Structure of the <tt>ExpressionSet</tt> class, showing its slots and their meaning. Reproduced from Klaus, B., & Reisenauer, S. (2018)</figcaption>
</figure>
</div>
</div>
</div>
<p>The advantage of the OOP approach is that, if a new type of omics data needs a similar but different structure, it can be created using inheritance, which means much less work and better consistency than creating it from scratch.</p>
</section>
<section id="a-toy-dataset" class="level2" data-number="2.4">
<h2 data-number="2.4"><span class="header-section-number">2.4</span> A toy dataset</h2>
<p>For the purpose of this lab we are going to simulate a toy (fake) dataset that consists of the following:</p>
<ul>
<li><p><b>Expression values</b> A matrix of 30 rows and 10 columns containing expression values from a gene expression experiment. Matrix column names are sample identifiers</p></li>
<li><p><b>Covariates</b> A table of ten rows and four columns containing the sample identifiers, the treatment groups and the age and sex of individuals.</p></li>
<li><p><b>Genes</b> Information about the features contained in the data. May be the gene names, the probeset identifiers etc. Usually stored in a character vector but may also be a table with distinct annotations per feature.</p></li>
<li><p><b>Information about the experiment</b> Additional information about the study, such as the authors and their contact details or the title and url of the study that originated them.</p></li>
</ul>
<div class="cell">
<div class="sourceCode" id="cb7"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a>expressionValues <span class="ot"><-</span> <span class="fu">matrix</span> (<span class="fu">rnorm</span> (<span class="dv">300</span>), <span class="at">nrow=</span><span class="dv">30</span>)</span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="fu">colnames</span>(expressionValues) <span class="ot"><-</span> <span class="fu">paste0</span>(<span class="st">"sample"</span>,<span class="dv">1</span><span class="sc">:</span><span class="dv">10</span>)</span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(expressionValues)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> sample1 sample2 sample3 sample4 sample5 sample6
[1,] -1.79178318 0.3647708 1.06828944 0.8071615 0.2694693 2.1486020
[2,] -1.11457291 2.1863533 0.59926802 0.2800048 -0.1337830 0.4469043
[3,] 0.54410546 -0.3089568 -1.39095723 -0.4793362 -0.3394864 1.9156204
[4,] -0.05679919 0.8054591 -0.07347829 1.5515076 0.1411121 -2.4126758
[5,] -0.27897274 0.2667203 0.65244729 0.9092124 0.4477909 0.8531942
[6,] -0.62183797 0.9379370 0.08276914 -0.3436316 0.1730609 2.0550911
sample7 sample8 sample9 sample10
[1,] -1.49156854 -0.5146063 -0.08453896 0.9277924
[2,] -1.57358546 -0.8892892 0.40470823 1.1063466
[3,] -3.14163786 -0.6630793 0.19163676 0.2276276
[4,] 0.02071324 0.6799446 -0.82287280 -0.9756018
[5,] -0.97863706 -2.0371544 -1.20788557 0.4636413
[6,] -0.55963070 0.5038611 1.02490765 -1.8806036</code></pre>
</div>
</div>
<p><strong>VERY IMPORTANT</strong>: To create the ExpressionSet the following has to be verified:</p>
<ul>
<li>The names of the columns of the object that contains the expressions, that will be stored in <code>assayData</code>, must match the names of the rows of the object that contains the covariates, that will be stored in <code>phenoData</code>.</li>
</ul>
<p>In this example the sample identifiers are supplied in the variable <code>sampleNames</code>, but this field is used as the <em>name of the rows</em> (through <code>row.names=1</code>), not as another column.</p>
<div class="cell">
<div class="sourceCode" id="cb9"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>targets <span class="ot"><-</span> <span class="fu">data.frame</span>(<span class="at">sampleNames =</span> <span class="fu">paste0</span>(<span class="st">"sample"</span>,<span class="dv">1</span><span class="sc">:</span><span class="dv">10</span>),</span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a> <span class="at">group=</span><span class="fu">c</span>(<span class="fu">paste0</span>(<span class="st">"CTL"</span>,<span class="dv">1</span><span class="sc">:</span><span class="dv">5</span>),<span class="fu">paste0</span>(<span class="st">"TR"</span>,<span class="dv">1</span><span class="sc">:</span><span class="dv">5</span>)),</span>
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a> <span class="at">age =</span> <span class="fu">rpois</span>(<span class="dv">10</span>, <span class="dv">30</span>), </span>
<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a> <span class="at">sex=</span><span class="fu">as.factor</span>(<span class="fu">sample</span>(<span class="fu">c</span>(<span class="st">"Male"</span>, <span class="st">"Female"</span>),<span class="dv">10</span>,<span class="at">replace=</span><span class="cn">TRUE</span>)),</span>
<span id="cb9-5"><a href="#cb9-5" aria-hidden="true" tabindex="-1"></a> <span class="at">row.names=</span><span class="dv">1</span>)</span>
<span id="cb9-6"><a href="#cb9-6" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(targets, <span class="at">n=</span><span class="dv">10</span>)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> group age sex
sample1 CTL1 29 Female
sample2 CTL2 30 Male
sample3 CTL3 30 Male
sample4 CTL4 23 Female
sample5 CTL5 30 Male
sample6 TR1 38 Male
sample7 TR2 22 Male
sample8 TR3 28 Female
sample9 TR4 32 Male
sample10 TR5 30 Female</code></pre>
</div>
</div>
<div class="cell">
<div class="sourceCode" id="cb11"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a>myGenes <span class="ot"><-</span> <span class="fu">paste0</span>(<span class="st">"gene"</span>,<span class="dv">1</span><span class="sc">:</span><span class="dv">30</span>)</span></code></pre></div>
</div>
<div class="cell">
<div class="sourceCode" id="cb12"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a>myInfo<span class="ot">=</span><span class="fu">list</span>(<span class="at">myName=</span><span class="st">"Alex Sanchez"</span>, </span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a> <span class="at">myLab=</span><span class="st">"Bioinformatics Lab"</span>,</span>
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a> <span class="at">myContact=</span><span class="st">"[email protected]"</span>, </span>
<span id="cb12-4"><a href="#cb12-4" aria-hidden="true" tabindex="-1"></a> <span class="at">myTitle=</span><span class="st">"Practical Exercise on ExpressionSets"</span>)</span>
<span id="cb12-5"><a href="#cb12-5" aria-hidden="true" tabindex="-1"></a><span class="fu">show</span>(myInfo)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>$myName
[1] "Alex Sanchez"
$myLab
[1] "Bioinformatics Lab"
$myContact
[1] "[email protected]"
$myTitle
[1] "Practical Exercise on ExpressionSets"</code></pre>
</div>
</div>
<p>Having data stored in this way is usually enough for most of the analyses we may want to do. The only inconvenience comes from the fact that the information about the same individuals is in separate R objects so that, for certain applications, we will have to access several objects and <em>assume they are well related</em>.</p>
<p>For example, if we want to make a principal components analysis and plot the groups by treatment we need to use both <code>expressionValues</code> and <code>targets</code>.</p>
<div class="cell">
<div class="sourceCode" id="cb14"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a>pcs <span class="ot"><-</span> <span class="fu">prcomp</span>(expressionValues)</span>
<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a><span class="fu">names</span>(pcs)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "sdev" "rotation" "center" "scale" "x" </code></pre>
</div>
<div class="sourceCode" id="cb16"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a><span class="fu">barplot</span>(pcs<span class="sc">$</span>sdev)</span></code></pre></div>
<div class="cell-output-display">
<div>
<figure>
<p><img src="index_files/figure-html/PCA1-1.png" class="img-fluid" width="672" alt="Bar plot of the standard deviations of the principal components" /></p>
</figure>
</div>
</div>
<div class="sourceCode" id="cb17"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(pcs<span class="sc">$</span>rotation[,<span class="dv">1</span>], pcs<span class="sc">$</span>rotation[,<span class="dv">2</span>], </span>
<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a> <span class="at">main=</span><span class="st">"Representation of first two principal components"</span>)</span>
<span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a><span class="fu">text</span>(pcs<span class="sc">$</span>rotation[,<span class="dv">1</span>], pcs<span class="sc">$</span>rotation[,<span class="dv">2</span>], targets<span class="sc">$</span>group, <span class="at">cex=</span><span class="fl">0.8</span>, <span class="at">pos=</span><span class="dv">3</span>)</span></code></pre></div>
<div class="cell-output-display">
<div>
<figure>
<p><img src="index_files/figure-html/PCA1-2.png" class="img-fluid" width="672" alt="Scatter plot of the first two principal components, with points labelled by group" /></p>
</figure>
</div>
</div>
</div>
<p>Or, if we sort the genes from most to least variable and want to see which are the top variable genes, we need to use both objects <code>expressionValues</code> and <code>myGenes</code>, assuming they are well linked:</p>
<div class="cell">
<div class="sourceCode" id="cb18"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a>variab <span class="ot"><-</span> <span class="fu">apply</span>(expressionValues, <span class="dv">1</span>, sd)</span>
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a>orderedGenes <span class="ot"><-</span> myGenes[<span class="fu">order</span>(variab, <span class="at">decreasing=</span><span class="cn">TRUE</span>)]</span>
<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(variab[<span class="fu">order</span>(variab, <span class="at">decreasing=</span><span class="cn">TRUE</span>)])</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] 1.490262 1.320940 1.314557 1.196881 1.113559 1.098817</code></pre>
</div>
<div class="sourceCode" id="cb20"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(orderedGenes)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "gene23" "gene20" "gene3" "gene1" "gene2" "gene4" </code></pre>
</div>
</div>
<p>Imagine we are informed that individual 9 has to be removed. We have to do it in both “expressionValues” and “targets”.</p>
<div class="cell">
<div class="sourceCode" id="cb22"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a>newExpress<span class="ot"><-</span> expressionValues[,<span class="sc">-</span><span class="dv">9</span>]</span>
<span id="cb22-2"><a href="#cb22-2" aria-hidden="true" tabindex="-1"></a>newTargets <span class="ot"><-</span> targets[<span class="sc">-</span><span class="dv">9</span>,]</span>
<span id="cb22-3"><a href="#cb22-3" aria-hidden="true" tabindex="-1"></a>wrongNewTargets <span class="ot"><-</span> targets [<span class="sc">-</span><span class="dv">10</span>,]</span></code></pre></div>
</div>
<p>It is relatively easy to make an unnoticed mistake by removing unrelated values from the data matrix and the targets table. If instead of removing individual 9 we remove individual 10 it may be difficult to realize what has happened unless it causes a clear inconsistency!</p>
</section>
<section id="creating-and-using-objects-of-class-expressionset" class="level2" data-number="2.5">
<h2 data-number="2.5"><span class="header-section-number">2.5</span> Creating and using objects of class ExpressionSet</h2>
<p>In order to use a class we need to <em>instantiate</em> it, that is we need to create an object of this class.</p>
<p>This can be done using the generic constructor <tt>new</tt> or with the function <tt>ExpressionSet</tt>.</p>
<p>Both the constructor and the function require a series of parameters which roughly correspond to the slots of the class (type <tt>? ExpressionSet</tt> to see a list of compulsory and optional arguments).</p>
<p>In the following subsections we describe how to create an <tt>ExpressionSet</tt> using the components of the toy dataset. Some of the elements will directly be the element in the toy dataset, such as the expression matrix. For others, such as the covariates or the experiment information, specific classes have been introduced so that we have to instantiate these classes first and then use the objects created to create the <tt>ExpressionSet</tt> object.</p>
<section id="slot-assaydata" class="level3" data-number="2.5.1">
<h3 data-number="2.5.1"><span class="header-section-number">2.5.1</span> Slot <tt>AssayData</tt></h3>
<p>The main element, and indeed the only one that must be provided to create an <tt>ExpressionSet</tt>, is <tt>AssayData</tt>. For our practical purposes it can be seen as a matrix with as many rows as genes or, generically, “features” and as many columns as samples or individuals.</p>
<div class="cell">
<div class="sourceCode" id="cb23"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a>myEset <span class="ot"><-</span> <span class="fu">ExpressionSet</span>(expressionValues)</span>
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a><span class="fu">class</span>(myEset)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "ExpressionSet"
attr(,"package")
[1] "Biobase"</code></pre>
</div>
<div class="sourceCode" id="cb25"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb25-1"><a href="#cb25-1" aria-hidden="true" tabindex="-1"></a><span class="fu">show</span>(myEset)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>ExpressionSet (storageMode: lockedEnvironment)
assayData: 30 features, 10 samples
element names: exprs
protocolData: none
phenoData: none
featureData: none
experimentData: use 'experimentData(object)'
Annotation: </code></pre>
</div>
</div>
</section>
<section id="information-about-covariates" class="level3" data-number="2.5.2">
<h3 data-number="2.5.2"><span class="header-section-number">2.5.2</span> Information about covariates</h3>
<p>Covariates, such as those contained in the “targets” data frame are not included in the “ExpressionSet” “as.is”. Instead we have first to create an intermediate object of class <tt>AnnotatedDataFrame</tt>.</p>
<p>Class <tt>AnnotatedDataFrame</tt> is intended to contain a data frame where we may want to provide enhanced information for columns, i.e. besides the short column names, longer labels to describe them better.</p>
<p>The information about covariates, contained in an instance of class <tt>AnnotatedDataFrame</tt>, is stored in the slot <tt>phenoData</tt>.</p>
<div class="cell">
<div class="sourceCode" id="cb27"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb27-1"><a href="#cb27-1" aria-hidden="true" tabindex="-1"></a>columnDesc <span class="ot"><-</span> <span class="fu">data.frame</span>(<span class="at">labelDescription=</span> <span class="fu">c</span>(<span class="st">"Treatment/Control"</span>, </span>
<span id="cb27-2"><a href="#cb27-2" aria-hidden="true" tabindex="-1"></a> <span class="st">"Age at disease onset"</span>, </span>
<span id="cb27-3"><a href="#cb27-3" aria-hidden="true" tabindex="-1"></a> <span class="st">"Sex of patient (Male/Female"</span>))</span>
<span id="cb27-4"><a href="#cb27-4" aria-hidden="true" tabindex="-1"></a>myAnnotDF <span class="ot"><-</span> <span class="fu">new</span>(<span class="st">"AnnotatedDataFrame"</span>, <span class="at">data=</span>targets, <span class="at">varMetadata=</span> columnDesc)</span>
<span id="cb27-5"><a href="#cb27-5" aria-hidden="true" tabindex="-1"></a><span class="fu">show</span>(myAnnotDF)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>An object of class 'AnnotatedDataFrame'
rowNames: sample1 sample2 ... sample10 (10 total)
varLabels: group age sex
varMetadata: labelDescription</code></pre>
</div>
</div>
<p>Notice that we have not included a label for sample names because this information is not a column of the <code>phenoData</code> object.</p>
<p>Once we have an <tt>AnnotatedDataFrame</tt> we can add it to the <tt>ExpressionSet</tt></p>
<div class="cell">
<div class="sourceCode" id="cb29"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb29-1"><a href="#cb29-1" aria-hidden="true" tabindex="-1"></a><span class="fu">phenoData</span>(myEset) <span class="ot"><-</span> myAnnotDF</span></code></pre></div>
</div>
<p>Alternatively we could have created the <tt>AnnotatedDataFrame</tt> object first and then created the <tt>ExpressionSet</tt> object with both the expression values and the covariates. In this case it would be required that the expression matrix column names are the same as the targets row names.</p>
<div class="cell">
<div class="sourceCode" id="cb30"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" aria-hidden="true" tabindex="-1"></a>myEset <span class="ot"><-</span> <span class="fu">ExpressionSet</span>(<span class="at">assayData=</span>expressionValues, <span class="at">phenoData=</span>myAnnotDF)</span>
<span id="cb30-2"><a href="#cb30-2" aria-hidden="true" tabindex="-1"></a><span class="fu">show</span>(myEset)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>ExpressionSet (storageMode: lockedEnvironment)
assayData: 30 features, 10 samples
element names: exprs
protocolData: none
phenoData
sampleNames: sample1 sample2 ... sample10 (10 total)
varLabels: group age sex
varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
Annotation: </code></pre>
</div>
</div>
</section>
<section id="adding-information-about-features" class="level3" data-number="2.5.3">
<h3 data-number="2.5.3"><span class="header-section-number">2.5.3</span> Adding information about features</h3>
<p>Similarly to what we do to store information about covariates, information about genes (or generically “features”) may be stored in the optional slot <tt>featureData</tt> as an <tt>AnnotatedDataFrame</tt>.</p>
<p>The number of rows in <tt>featureData</tt> must match the number of rows in <tt>assayData</tt>. Row names of <tt>featureData</tt> must match row names of the matrix / matrices in <tt>assayData</tt>.</p>
<p>This slot is useful if one has an annotations table that one wishes to store and manage jointly with the other values. Alternatively we can simply store the names of the features using a character vector in the slot <tt>featureNames</tt>.</p>
<div class="cell">
<div class="sourceCode" id="cb32"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb32-1"><a href="#cb32-1" aria-hidden="true" tabindex="-1"></a>myEset <span class="ot"><-</span> <span class="fu">ExpressionSet</span>(<span class="at">assayData=</span>expressionValues,</span>
<span id="cb32-2"><a href="#cb32-2" aria-hidden="true" tabindex="-1"></a> <span class="at">phenoData=</span>myAnnotDF,</span>
<span id="cb32-3"><a href="#cb32-3" aria-hidden="true" tabindex="-1"></a> <span class="at">featureNames =</span>myGenes)</span>
<span id="cb32-4"><a href="#cb32-4" aria-hidden="true" tabindex="-1"></a><span class="co"># show(myEset)</span></span></code></pre></div>
</div>
</section>
<section id="storing-information-about-the-experiment" class="level3" data-number="2.5.4">
<h3 data-number="2.5.4"><span class="header-section-number">2.5.4</span> Storing information about the experiment</h3>
<p>In a similar way to what happens with the <tt>AnnotatedDataFrame</tt> class, a class has been developed to store information about the experiment. The structure of the class, called <tt>MIAME</tt>, follows the structure of what has been described as the “Minimum Information About a Microarray Experiment”; see <a href="https://www.ncbi.nlm.nih.gov/pubmed/11726920">www.ncbi.nlm.nih.gov/pubmed/11726920</a>.</p>
<p>This is useful information but it is clearly optional for data analysis.</p>
<div class="cell">
<div class="sourceCode" id="cb33"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb33-1"><a href="#cb33-1" aria-hidden="true" tabindex="-1"></a>myDesc <span class="ot"><-</span> <span class="fu">new</span>(<span class="st">"MIAME"</span>, <span class="at">name=</span> myInfo[[<span class="st">"myName"</span>]],</span>
<span id="cb33-2"><a href="#cb33-2" aria-hidden="true" tabindex="-1"></a> <span class="at">lab=</span> myInfo[[<span class="st">"myLab"</span>]],</span>
<span id="cb33-3"><a href="#cb33-3" aria-hidden="true" tabindex="-1"></a> <span class="at">contact=</span> myInfo[[<span class="st">"myContact"</span>]] ,</span>
<span id="cb33-4"><a href="#cb33-4" aria-hidden="true" tabindex="-1"></a> <span class="at">title=</span>myInfo[[<span class="st">"myTitle"</span>]])</span>
<span id="cb33-5"><a href="#cb33-5" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(myDesc)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>Experiment data
Experimenter name: Alex Sanchez
Laboratory: Bioinformatics Lab
Contact information: [email protected]
Title: Practical Exercise on ExpressionSets
URL:
PMIDs:
No abstract available.</code></pre>
</div>
</div>
<p>Again we could add this object to the <tt>ExpressionSet</tt> or use it when creating it from scratch.</p>
<div class="cell">
<div class="sourceCode" id="cb35"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb35-1"><a href="#cb35-1" aria-hidden="true" tabindex="-1"></a>myEset <span class="ot"><-</span> <span class="fu">ExpressionSet</span>(<span class="at">assayData=</span>expressionValues,</span>
<span id="cb35-2"><a href="#cb35-2" aria-hidden="true" tabindex="-1"></a> <span class="at">phenoData=</span>myAnnotDF,</span>
<span id="cb35-3"><a href="#cb35-3" aria-hidden="true" tabindex="-1"></a> <span class="at">fetureNames =</span>myGenes,</span>
<span id="cb35-4"><a href="#cb35-4" aria-hidden="true" tabindex="-1"></a> <span class="at">experimentData =</span> myDesc)</span>
<span id="cb35-5"><a href="#cb35-5" aria-hidden="true" tabindex="-1"></a><span class="co"># show(myEset)</span></span></code></pre></div>
</div>
</section>
</section>
<section id="using-objects-of-class-expressionset" class="level2" data-number="2.6">
<h2 data-number="2.6"><span class="header-section-number">2.6</span> Using objects of class <tt>ExpressionSet</tt></h2>
<p>The advantage of working with <tt>ExpressionSets</tt> lies in the fact that actions on the objects are done in such a way that their consistency is ensured. That means, for instance, that if we subset the <tt>ExpressionSet</tt>, the subsetting is automatically applied to the columns of the expressions and to the rows of the covariates, and it is not possible for a different row/column to be removed.</p>
<p>The following lines illustrate some management of data in an <tt>ExpressionSet</tt>.</p>
<section id="accessing-slot-values" class="level3" data-number="2.6.1">
<h3 data-number="2.6.1"><span class="header-section-number">2.6.1</span> Accessing Slot values</h3>
<p>Notice that to access the values we use special functions called “accessors” instead of the dollar symbol (which would not work for classes) or the @ symbol, which takes the place of the $ symbol for these classes.</p>
<p>Notice also that, in order to access the data frame contained in the <tt>phenoData</tt> slot, which is an <tt>AnnotatedDataFrame</tt>, we need to use two accessors: <tt>phenoData</tt> to access the <tt>ExpressionSet</tt>’s <tt>phenoData</tt> slot and <tt>pData</tt> to access the <tt>data</tt> slot in it. It is strange until you get used to it!</p>
<div class="cell">
<div class="sourceCode" id="cb36"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb36-1"><a href="#cb36-1" aria-hidden="true" tabindex="-1"></a><span class="fu">dim</span>(<span class="fu">exprs</span>(myEset))</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] 30 10</code></pre>
</div>
<div class="sourceCode" id="cb38"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb38-1"><a href="#cb38-1" aria-hidden="true" tabindex="-1"></a><span class="fu">class</span>(<span class="fu">phenoData</span>(myEset))</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "AnnotatedDataFrame"
attr(,"package")
[1] "Biobase"</code></pre>
</div>
<div class="sourceCode" id="cb40"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb40-1"><a href="#cb40-1" aria-hidden="true" tabindex="-1"></a><span class="fu">class</span>(<span class="fu">pData</span>(<span class="fu">phenoData</span>(myEset)))</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "data.frame"</code></pre>
</div>
<div class="sourceCode" id="cb42"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb42-1"><a href="#cb42-1" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(<span class="fu">pData</span>(<span class="fu">phenoData</span>(myEset)))</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> group age sex
sample1 CTL1 29 Female
sample2 CTL2 30 Male
sample3 CTL3 30 Male
sample4 CTL4 23 Female
sample5 CTL5 30 Male
sample6 TR1 38 Male</code></pre>
</div>
<div class="sourceCode" id="cb44"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb44-1"><a href="#cb44-1" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(<span class="fu">pData</span>(myEset))</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> group age sex
sample1 CTL1 29 Female
sample2 CTL2 30 Male
sample3 CTL3 30 Male
sample4 CTL4 23 Female
sample5 CTL5 30 Male
sample6 TR1 38 Male</code></pre>
</div>
</div>
</section>
<section id="subsetting-expressionsets" class="level3" data-number="2.6.2">
<h3 data-number="2.6.2"><span class="header-section-number">2.6.2</span> Subsetting <code>ExpressionSets</code></h3>
<p>This is where the interest of using <tt>ExpressionSets</tt> is most clearly realized.</p>
<p>The <tt>ExpressionSet</tt> object has been cleverly designed to make data manipulation consistent with other basic R object types. For example, creating a subset of an <tt>ExpressionSet</tt> will subset the expression matrix, sample information and feature annotation (if available) simultaneously in an appropriate manner. The user does not need to know how the object is represented “under-the-hood”. In effect, we can treat the <tt>ExpressionSet</tt> as if it is a standard R data frame.</p>
<div class="cell">
<div class="sourceCode" id="cb46"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb46-1"><a href="#cb46-1" aria-hidden="true" tabindex="-1"></a>smallEset <span class="ot"><-</span> myEset[<span class="dv">1</span><span class="sc">:</span><span class="dv">15</span>,<span class="fu">c</span>(<span class="dv">1</span><span class="sc">:</span><span class="dv">3</span>,<span class="dv">6</span><span class="sc">:</span><span class="dv">8</span>)]</span>
<span id="cb46-2"><a href="#cb46-2" aria-hidden="true" tabindex="-1"></a><span class="fu">dim</span>(<span class="fu">exprs</span>(smallEset))</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] 15 6</code></pre>
</div>
<div class="sourceCode" id="cb48"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb48-1"><a href="#cb48-1" aria-hidden="true" tabindex="-1"></a><span class="fu">dim</span>(<span class="fu">pData</span>(smallEset))</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] 6 3</code></pre>
</div>
<div class="sourceCode" id="cb50"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb50-1"><a href="#cb50-1" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(<span class="fu">pData</span>(smallEset))</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> group age sex
sample1 CTL1 29 Female
sample2 CTL2 30 Male
sample3 CTL3 30 Male
sample6 TR1 38 Male
sample7 TR2 22 Male
sample8 TR3 28 Female</code></pre>
</div>
<div class="sourceCode" id="cb52"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb52-1"><a href="#cb52-1" aria-hidden="true" tabindex="-1"></a><span class="fu">all</span>(<span class="fu">colnames</span>(<span class="fu">exprs</span>(smallEset))<span class="sc">==</span><span class="fu">rownames</span>(<span class="fu">pData</span>(smallEset)))</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] TRUE</code></pre>
</div>
</div>
<p>We can for instance create a new dataset for all individuals younger than 30 or for all females without having to worry about doing it in every component.</p>
<div class="cell">
<div class="sourceCode" id="cb54"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb54-1"><a href="#cb54-1" aria-hidden="true" tabindex="-1"></a>youngEset <span class="ot"><-</span> myEset[,<span class="fu">pData</span>(myEset)<span class="sc">$</span>age<span class="sc"><</span><span class="dv">30</span>]</span>
<span id="cb54-2"><a href="#cb54-2" aria-hidden="true" tabindex="-1"></a><span class="fu">dim</span>(<span class="fu">exprs</span>(youngEset))</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] 30 4</code></pre>
</div>
<div class="sourceCode" id="cb56"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb56-1"><a href="#cb56-1" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(<span class="fu">pData</span>(youngEset))</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> group age sex
sample1 CTL1 29 Female
sample4 CTL4 23 Female
sample7 TR2 22 Male
sample8 TR3 28 Female</code></pre>
</div>
</div>
</section>
</section>
<section id="exercises" class="level2" data-number="2.7">
<h2 data-number="2.7"><span class="header-section-number">2.7</span> Exercises</h2>
<ol start="4" type="1">
<li><p>Create an <code>ExpressionSet</code> object to contain the data for the example study using the data you have downloaded and used in the first section. That is, adapt the steps taken to create the <code>ExpressionSet</code> with the toy dataset to create one with the data from the study.</p></li>
<li><p>Do some subsetting and check the consistency of the results obtained. For example, remove some sample from the covariates slot (the <code>phenoData</code>) and see if it is automatically removed from the expression matrix.</p></li>
<li><p>Check that you are able to reproduce the analysis in the first part accessing the components of the object created.</p></li>
</ol>
</section>
</section>
<section id="the-geoquery-package-to-download-data-from-geo" class="level1" data-number="3">
<h1 data-number="3"><span class="header-section-number">3</span> The <code>GEOquery</code> package to download data from GEO</h1>
<p>The NCBI Gene Expression Omnibus (GEO) serves as a public repository for a wide range of high-throughput experimental data. These data include single and dual channel microarray-based experiments measuring mRNA, genomic DNA, and protein abundance, as well as non-array techniques such as serial analysis of gene expression (SAGE), mass spectrometry proteomic data, and high-throughput sequencing data.</p>
<p>At the most basic level of organization of GEO, there are four basic entity types. The first three (Sample, Platform, and Series) are supplied by users; the fourth, the dataset, is compiled and curated by GEO staff from the user-submitted data. More information is available in the <a href="https://www.ncbi.nlm.nih.gov/geo/info/overview.html">GEO site</a> and in the document <a href="https://github.com/ASPteaching/Analisis_de_datos_omicos-Ejemplo_0-Microarrays">Analisis_de_datos_omicos-Ejemplo_0-Microarrays</a> available in github.</p>
<p>Data can be downloaded from GEO in a wide variety of formats and using a variety of mechanisms. See the download page in <a href="https://www.ncbi.nlm.nih.gov/geo/info/download.html">this link</a>.</p>
<p>Here we focus on an alternative based on Bioconductor, the <code>GEOquery</code> package (<a href="http://bioconductor.org/packages/release/bioc/html/GEOquery.html" class="uri">http://bioconductor.org/packages/release/bioc/html/GEOquery.html</a>)</p>
<p>This package has been developed <strong>to facilitate downloading data from GEO and turning them into objects of Bioconductor classes such as <code>expressionSets</code></strong></p>
<p>The best way to learn how to use this package is following its <a href="http://bioconductor.org/packages/release/bioc/vignettes/GEOquery/inst/doc/GEOquery.html">vignette, available at the package site</a>.</p>
<p>Here we only describe how to download a dataset using either its series (“GSExxx”) or its Dataset (“GDSxxx”) identifier.</p>
<p>In the following lines we illustrate how to get the data for this example using the dataset used in the case study <a href="https://github.com/ASPteaching/Analisis_de_datos_omicos-Ejemplo_0-Microarrays">Analisis_de_datos_omicos-Ejemplo_0-Microarrays</a>, available from GitHub.</p>
<p>As can be seen there, the dataset has the following identifiers:</p>
<ul>
<li>Series accession ID: GSE27174</li>
<li>Dataset accession ID: GDS4155</li>
<li>Platform accession ID: GPL6246</li>
</ul>
<section id="downloading-a-dataset-in-gse-format" class="level2" data-number="3.1">
<h2 data-number="3.1"><span class="header-section-number">3.1</span> Downloading a dataset in GSE format</h2>
<p>Getting a series dataset from GEO is straightforward. There is only one command that is needed: <code>getGEO</code>.</p>
<p>This function interprets its input (depending on the data format) to determine how to get the data from GEO and then parse the data into useful R data structures.</p>
<div class="cell">
<div class="sourceCode" id="cb58"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb58-1"><a href="#cb58-1" aria-hidden="true" tabindex="-1"></a><span class="cf">if</span> (<span class="sc">!</span><span class="fu">require</span>(GEOquery)) {</span>
<span id="cb58-2"><a href="#cb58-2" aria-hidden="true" tabindex="-1"></a> BiocManager<span class="sc">::</span><span class="fu">install</span>(<span class="st">"GEOquery"</span>)</span>
<span id="cb58-3"><a href="#cb58-3" aria-hidden="true" tabindex="-1"></a>}</span></code></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>Loading required package: GEOquery</code></pre>
</div>
<div class="cell-output cell-output-stderr">
<pre><code>Setting options('download.file.method.GEOquery'='auto')</code></pre>
</div>
<div class="cell-output cell-output-stderr">
<pre><code>Setting options('GEOquery.inmemory.gpl'=FALSE)</code></pre>
</div>
<div class="sourceCode" id="cb62"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb62-1"><a href="#cb62-1" aria-hidden="true" tabindex="-1"></a><span class="fu">require</span>(GEOquery)</span>
<span id="cb62-2"><a href="#cb62-2" aria-hidden="true" tabindex="-1"></a>gse <span class="ot"><-</span> <span class="fu">getGEO</span>(<span class="st">"GSE27174"</span>, <span class="at">GSEMatrix=</span><span class="cn">TRUE</span>, <span class="at">AnnotGPL=</span><span class="cn">TRUE</span>)</span></code></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>Found 1 file(s)</code></pre>
</div>
<div class="cell-output cell-output-stderr">
<pre><code>GSE27174_series_matrix.txt.gz</code></pre>
</div>
</div>
<p>If the data format requested is a “Series” (GSExxxx), the function returns a list, each of whose elements is an expressionSet (this is because a Series may sometimes contain several collections of samples).</p>
<div class="cell">
<div class="sourceCode" id="cb65"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb65-1"><a href="#cb65-1" aria-hidden="true" tabindex="-1"></a><span class="fu">class</span>(gse)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "list"</code></pre>
</div>
<div class="sourceCode" id="cb67"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb67-1"><a href="#cb67-1" aria-hidden="true" tabindex="-1"></a><span class="fu">names</span>(gse)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "GSE27174_series_matrix.txt.gz"</code></pre>
</div>
<div class="sourceCode" id="cb69"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb69-1"><a href="#cb69-1" aria-hidden="true" tabindex="-1"></a><span class="fu">length</span>(gse)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] 1</code></pre>
</div>
<div class="sourceCode" id="cb71"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb71-1"><a href="#cb71-1" aria-hidden="true" tabindex="-1"></a>gse[[<span class="dv">1</span>]]</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>ExpressionSet (storageMode: lockedEnvironment)
assayData: 35557 features, 8 samples
element names: exprs
protocolData: none
phenoData
sampleNames: GSM671653 GSM671654 ... GSM671660 (8 total)
varLabels: title geo_accession ... strain:ch1 (40 total)
varMetadata: labelDescription
featureData
featureNames: 10338001 10338002 ... 10608724 (35557 total)
fvarLabels: ID Gene title ... GO:Component ID (21 total)
fvarMetadata: Column Description labelDescription
experimentData: use 'experimentData(object)'
pubMedIds: 21725324
Annotation: GPL6246 </code></pre>
</div>
<div class="sourceCode" id="cb73"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb73-1"><a href="#cb73-1" aria-hidden="true" tabindex="-1"></a>esetFromGEO <span class="ot"><-</span> gse[[<span class="dv">1</span>]]</span></code></pre></div>
</div>
<p>By creating the expressionSet automatically the slow process of creating the object step by step, as in the previous section, can be avoided.</p>
<p>The expressionSet can now be used as usual:</p>
<div class="cell">
<div class="sourceCode" id="cb74"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb74-1"><a href="#cb74-1" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(<span class="fu">exprs</span>(esetFromGEO))</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> GSM671653 GSM671654 GSM671655 GSM671656 GSM671657 GSM671658 GSM671659
10338001 13.12027 12.97898 12.99977 12.93720 13.07715 13.06317 12.87192
10338002 6.47144 6.29206 6.60156 6.09510 6.79910 5.77111 5.73771
10338003 11.50182 11.23240 11.11705 11.03028 11.35657 11.42738 10.91709
10338004 10.37514 10.21853 10.29204 10.17732 10.55388 10.43201 10.07903
10338005 1.78245 1.76433 2.77200 1.95012 2.11798 1.65184 2.10928
10338006 2.72243 2.20203 1.60098 2.70849 3.06379 2.31079 1.93041
GSM671660
10338001 12.91035
10338002 7.02775
10338003 11.09959
10338004 10.43057
10338005 1.70317
10338006 1.78245</code></pre>
</div>
</div>
<p>We can look at the covariates information, but the phenoData object created automatically contains a lot of repeated information. If needed, we can explore it and decide which columns to keep and which may be removed. For instance, we keep the last two columns and see that column 39 contains the information that defines the groups.</p>
<div class="cell">
<div class="sourceCode" id="cb76"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb76-1"><a href="#cb76-1" aria-hidden="true" tabindex="-1"></a><span class="fu">colnames</span>(<span class="fu">pData</span>(esetFromGEO))</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> [1] "title" "geo_accession"
[3] "status" "submission_date"
[5] "last_update_date" "type"
[7] "channel_count" "source_name_ch1"
[9] "organism_ch1" "characteristics_ch1"
[11] "characteristics_ch1.1" "characteristics_ch1.2"
[13] "characteristics_ch1.3" "characteristics_ch1.4"
[15] "treatment_protocol_ch1" "growth_protocol_ch1"
[17] "molecule_ch1" "extract_protocol_ch1"
[19] "label_ch1" "label_protocol_ch1"
[21] "taxid_ch1" "hyb_protocol"
[23] "scan_protocol" "data_processing"
[25] "platform_id" "contact_name"
[27] "contact_department" "contact_institute"
[29] "contact_address" "contact_city"
[31] "contact_state" "contact_zip/postal_code"
[33] "contact_country" "supplementary_file"
[35] "data_row_count" "cell type:ch1"
[37] "developmental stage:ch1" "genotype/variation:ch1"
[39] "infection status:ch1" "strain:ch1" </code></pre>
</div>
<div class="sourceCode" id="cb78"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb78-1"><a href="#cb78-1" aria-hidden="true" tabindex="-1"></a><span class="fu">pData</span>(esetFromGEO)[,<span class="dv">39</span><span class="sc">:</span><span class="dv">40</span>]</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> infection status:ch1
GSM671653 infected with lentiviruses expressing the three dopaminergic transcription factors Ascl1, Lmx1a and Nurr1
GSM671654 infected with lentiviruses expressing the three dopaminergic transcription factors Ascl1, Lmx1a and Nurr1
GSM671655 infected with lentiviruses expressing the three dopaminergic transcription factors Ascl1, Lmx1a and Nurr1
GSM671656 infected with lentiviruses expressing the three dopaminergic transcription factors Ascl1, Lmx1a and Nurr1
GSM671657 non-infected
GSM671658 non-infected
GSM671659 non-infected
GSM671660 non-infected
strain:ch1
GSM671653 C57BL/6J
GSM671654 C57BL/6J
GSM671655 C57BL/6J
GSM671656 C57BL/6J
GSM671657 C57BL/6J
GSM671658 C57BL/6J
GSM671659 C57BL/6J
GSM671660 C57BL/6J</code></pre>
</div>
</div>
</section>
<section id="downloading-a-dataset-in-gsd-format" class="level2" data-number="3.2">
<h2 data-number="3.2"><span class="header-section-number">3.2</span> Downloading a dataset in GDS format</h2>
<p>Alternatively, we may prefer to download the data in GDS format.</p>
<div class="cell">
<div class="sourceCode" id="cb80"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb80-1"><a href="#cb80-1" aria-hidden="true" tabindex="-1"></a>gds <span class="ot"><-</span> <span class="fu">getGEO</span>(<span class="st">"GDS4155"</span>)</span></code></pre></div>
</div>
<p>The object that has been created now is not a list but it is of a special class “GDS”</p>
<div class="cell">
<div class="sourceCode" id="cb81"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb81-1"><a href="#cb81-1" aria-hidden="true" tabindex="-1"></a><span class="fu">class</span>(gds)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "GDS"
attr(,"package")
[1] "GEOquery"</code></pre>
</div>
<div class="sourceCode" id="cb83"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb83-1"><a href="#cb83-1" aria-hidden="true" tabindex="-1"></a><span class="fu">slotNames</span>(gds)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "gpl" "dataTable" "header" </code></pre>
</div>
</div>
<p>Class ‘GDS’ is comprised of a metadata header (taken nearly verbatim from the SOFT format header) and a GEODataTable. The GEODataTable has two simple parts, a Columns part which describes the column headers on the Table part. There is also a show method (“Meta”) for the class.</p>
<div class="cell">
<div class="sourceCode" id="cb85"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb85-1"><a href="#cb85-1" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(<span class="fu">Meta</span>(gds))</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>$channel_count
[1] "1"
$dataset_id
[1] "GDS4155" "GDS4155"
$description
[1] "Analysis of induced dopaminergic (iDA) neurons generated from E14.5 mouse embryonic fibroblasts (MEFs) reprogrammed by infection with lentiviruses expressing dopaminergic transcription factors Ascl1, Lmx1a and Nurr1. Results provide insight into the molecular basis of MEF to iDA reprogramming."
[2] "dopaminergic-induced"
[3] "control"
$email
[1] "[email protected]"
$feature_count
[1] "35557"
$institute
[1] "NCBI NLM NIH"</code></pre>
</div>
</div>
<p>The gds object can be turned into an expressionSet that contains the same information as in the previous case:</p>
<div class="cell">
<div class="sourceCode" id="cb87"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb87-1"><a href="#cb87-1" aria-hidden="true" tabindex="-1"></a>eset <span class="ot"><-</span> <span class="fu">GDS2eSet</span>(gds,<span class="at">do.log2=</span><span class="cn">FALSE</span>)</span></code></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>Using locally cached version of GPL6246 found here:
C:\Users\Usuario\AppData\Local\Temp\RtmpOaCFZR/GPL6246.annot.gz </code></pre>
</div>
<div class="sourceCode" id="cb89"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb89-1"><a href="#cb89-1" aria-hidden="true" tabindex="-1"></a>eset</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>ExpressionSet (storageMode: lockedEnvironment)
assayData: 35557 features, 8 samples
element names: exprs
protocolData: none
phenoData
sampleNames: GSM671653 GSM671654 ... GSM671660 (8 total)
varLabels: sample genotype/variation description
varMetadata: labelDescription
featureData
featureNames: 10344614 10344616 ... 10344613 (35557 total)
fvarLabels: ID Gene title ... GO:Component ID (21 total)
fvarMetadata: Column labelDescription
experimentData: use 'experimentData(object)'
pubMedIds: 21725324
Annotation: </code></pre>
</div>
</div>
</section>
</section>
<section id="exercises-1" class="level1" data-number="4">
<h1 data-number="4"><span class="header-section-number">4</span> Exercises</h1>
<ol type="1">
<li><p>Select a <a href="https://www.ncbi.nlm.nih.gov/geo/"><em>GEO</em> (Gene Expression Omnibus) dataset</a> from the list presented in the “GEOdatasets_enhanced.xls” document available in the resources of the activity.</p></li>
<li><p>Read the data from GEO using the GEOquery package. This will provide you with an expressionSet class object with the normalized data and an additional table with information about the study.</p></li>
<li><p>Determine the structure of the data (rows, columns) and the design of the study (groups of samples or individuals, treatments if any, etc.) that generated them.</p>
<ul>
<li>The information of the experiment can also be downloaded from GEO, either with GEOquery if you provide the dataset identifier GDSxxxx or by accessing the study page.</li>
</ul></li>
</ol>
</section>
<section id="references" class="level1" data-number="5">
<h1 data-number="5"><span class="header-section-number">5</span> References</h1>
<ul>
<li><p>Cui, Dapeng, K. J. Dougherty, DW Machacek, S. Hochman, and D. J. Baro. 2006. “Divergence Between Motoneurons: Gene Expression Profiling Provides a Molecular Characterization of Functionally Discrete Somatic and Autonomic Motoneurons.” Physiol Genomics 24 (3): 276–89. <a href="https://doi.org/10.1152/physiolgenomics.00109.2005" class="uri">https://doi.org/10.1152/physiolgenomics.00109.2005</a></p></li>
<li><p>Clough, E., & Barrett, T. (2016). The Gene Expression Omnibus Database. In Methods in molecular biology (Clifton, N.J.) (Vol. 1418, pp. 93–110). <a href="https://doi.org/10.1007/978-1-4939-3578-9_5" class="uri">https://doi.org/10.1007/978-1-4939-3578-9_5</a></p></li>
<li><p>Davis, S., & Meltzer, P. (2007). GEOquery: a bridge between the Gene Expression Omnibus (GEO) and BioConductor. Bioinformatics, 23(14), 1846–1847.</p></li>
<li><p>W. Huber, V.J. Carey, R. Gentleman, …, M. Morgan. Orchestrating high-throughput genomic analysis with Bioconductor. Nature Methods, 2015:12, 115.</p></li>
<li><p>Klaus, B., & Reisenauer, S. (2018). An end to end workflow for differential gene expression using Affymetrix microarrays [version 2; referees: 2 approved]. F1000Research, 5, 1384.</p>
<p><a href="https://doi.org/10.12688/f1000research.8967.2" class="uri">https://doi.org/10.12688/f1000research.8967.2</a></p></li>
</ul>
</section>
<section id="additional-info" class="level1" data-number="6">
<h1 data-number="6"><span class="header-section-number">6</span> Additional info</h1>
<div class="cell">
<div class="sourceCode" id="cb91"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb91-1"><a href="#cb91-1" aria-hidden="true" tabindex="-1"></a><span class="fu">sessionInfo</span>()</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>R version 4.3.0 (2023-04-21 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 11 x64 (build 22631)
Matrix products: default
locale:
[1] LC_COLLATE=Spanish_Spain.utf8 LC_CTYPE=Spanish_Spain.utf8
[3] LC_MONETARY=Spanish_Spain.utf8 LC_NUMERIC=C
[5] LC_TIME=Spanish_Spain.utf8
time zone: Europe/Madrid
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] GEOquery_2.68.0 Biobase_2.60.0 BiocGenerics_0.46.0
loaded via a namespace (and not attached):
[1] limma_3.56.2 jsonlite_1.8.7 dplyr_1.1.3 compiler_4.3.0
[5] tidyselect_1.2.0 xml2_1.3.5 tidyr_1.3.0 png_0.1-8
[9] yaml_2.3.7 fastmap_1.1.1 readr_2.1.4 R6_2.5.1
[13] generics_0.1.3 curl_5.1.0 knitr_1.45 htmlwidgets_1.6.2
[17] tibble_3.2.1 pillar_1.9.0 tzdb_0.4.0 R.utils_2.12.2
[21] rlang_1.1.1 utf8_1.2.4 xfun_0.39 cli_3.6.1
[25] withr_2.5.1 magrittr_2.0.3 digest_0.6.31 rstudioapi_0.15.0
[29] hms_1.1.3 lifecycle_1.0.3 R.oo_1.25.0 R.methodsS3_1.8.2
[33] vctrs_0.6.4 evaluate_0.22 glue_1.6.2 data.table_1.14.8
[37] fansi_1.0.5 rmarkdown_2.25 purrr_1.0.2 tools_4.3.0
[41] pkgconfig_2.0.3 htmltools_0.5.5 </code></pre>
</div>
</div>
<div class="cell">
<div class="sourceCode" id="cb93"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb93-1"><a href="#cb93-1" aria-hidden="true" tabindex="-1"></a><span class="co"># An "index.html" file is created to allow visualization on the web using GitHub Pages</span></span>
<span id="cb93-2"><a href="#cb93-2" aria-hidden="true" tabindex="-1"></a><span class="fu">file.copy</span>(<span class="at">from=</span><span class="st">"Introduction_2_Bioc_classes_4_tabular_data.html"</span>, <span class="at">to=</span><span class="st">"index.html"</span>, <span class="at">overwrite=</span><span class="cn">TRUE</span>)</span></code></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] FALSE</code></pre>
</div>
</div>
<div class="cell">
<div class="sourceCode" id="cb95"><pre class="sourceCode r cell-code"><code class="sourceCode r"><span id="cb95-1"><a href="#cb95-1" aria-hidden="true" tabindex="-1"></a><span class="co"># The R code for the document can be extracted from the document with the </span></span>
<span id="cb95-2"><a href="#cb95-2" aria-hidden="true" tabindex="-1"></a><span class="co"># knitr::purl() command</span></span>
<span id="cb95-3"><a href="#cb95-3" aria-hidden="true" tabindex="-1"></a><span class="co"># knitr::purl("Introduction_2_Bioc_classes_4_tabular_data.qmd")</span></span></code></pre></div>
</div>
</section>
</main>
<!-- /main column -->
<script id = "quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
/**
 * Mirror a stylesheet element's `data-mode` attribute onto <body>.
 *
 * Adds `quarto-dark` and removes `quarto-light` when the attribute is exactly
 * "dark"; for any other value (including a missing attribute) it does the
 * opposite.
 *
 * @param {Element} bsSheetEl element whose `data-mode` attribute is read
 */
const toggleBodyColorMode = (bsSheetEl) => {
  const isDark = bsSheetEl.getAttribute("data-mode") === "dark";
  const { classList } = window.document.querySelector("body");
  classList.add(isDark ? "quarto-dark" : "quarto-light");
  classList.remove(isDark ? "quarto-light" : "quarto-dark");
};
/**
 * Sync the <body> color class with the `link#quarto-bootstrap` stylesheet.
 * Does nothing when no such <link> element is present in the document.
 */
const toggleBodyColorPrimary = () => {
  const bootstrapSheet = window.document.querySelector("link#quarto-bootstrap");
  if (!bootstrapSheet) {
    return;
  }
  toggleBodyColorMode(bootstrapSheet);
};
// Apply the body color class once, immediately, from the current stylesheet.
toggleBodyColorPrimary();
// NOTE(review): the icon is an empty string as written; confirm that no
// icon-font glyph was lost from this literal when the file was generated/copied.
const icon = "";
// AnchorJS is expected as a global on `window` (loaded elsewhere in the page).
const anchorJS = new window.AnchorJS();
// Replace the instance options wholesale with the placement and icon to use.
anchorJS.options = {
placement: 'right',
icon: icon
};
// Presumably attaches anchor links to every element matching `.anchored`
// (see the AnchorJS documentation for `add`).
anchorJS.add('.anchored');
/**
 * Report whether an element carries any class beginning with
 * `code-annotation-`.
 *
 * @param {{classList: Iterable<string>}} el element whose classes are inspected
 * @returns {boolean} true if at least one class name has that prefix
 */
const isCodeAnnotation = (el) =>
  Array.from(el.classList).some((className) => className.startsWith('code-annotation-'));
// Bind ClipboardJS to every `.code-copy-button`. The copied text is taken from
// the element immediately preceding the button, with code-annotation children
// stripped out.
const clipboard = new window.ClipboardJS('.code-copy-button', {
  text: function(trigger) {
    // Work on a deep clone so the visible code block is never modified.
    const codeEl = trigger.previousElementSibling.cloneNode(true);
    // `children` is a live HTMLCollection: removing a node while iterating it
    // shifts the remaining items down and skips the element that follows each
    // removed one. Iterate over a static snapshot instead.
    for (const childEl of Array.from(codeEl.children)) {
      if (isCodeAnnotation(childEl)) {
        childEl.remove();
      }
    }
    return codeEl.innerText;
  }
});
// After a successful copy: flash a "checked" state on the button, show a
// "Copied!" title (and a Bootstrap tooltip when Bootstrap is loaded), then
// restore the button to its previous state after one second.
clipboard.on('success', function(e) {
  // button target
  const button = e.trigger;
  // don't keep focus
  button.blur();
  // flash "checked"
  button.classList.add('code-copy-button-checked');
  // null when the button has no title attribute at all
  const previousTitle = button.getAttribute("title");
  button.setAttribute("title", "Copied!");
  // Attributes set only while the temporary tooltip is displayed.
  const tooltipAttrs = [
    ["data-bs-toggle", "tooltip"],
    ["data-bs-placement", "left"],
    ["data-bs-title", "Copied!"],
  ];
  let tooltip;
  if (window.bootstrap) {
    for (const [name, value] of tooltipAttrs) {
      button.setAttribute(name, value);
    }
    tooltip = new window.bootstrap.Tooltip(button, {
      trigger: "manual",
      customClass: "code-copy-button-tooltip",
      offset: [0, -8]
    });
    tooltip.show();
  }
  setTimeout(function() {
    if (tooltip) {
      tooltip.hide();
      for (const [name] of tooltipAttrs) {
        button.removeAttribute(name);
      }
    }
    // setAttribute stringifies its value, so restoring a missing title with
    // setAttribute would leave the literal text "null" on the button.
    if (previousTitle === null) {
      button.removeAttribute("title");
    } else {
      button.setAttribute("title", previousTitle);
    }
    button.classList.remove('code-copy-button-checked');
  }, 1000);
  // clear code selection
  e.clearSelection();
});
/**
 * Attach a tippy tooltip to `el` using the shared "quarto" configuration.
 *
 * Each of the three callbacks is copied into the configuration only when it
 * is truthy; otherwise the corresponding key is left out entirely.
 *
 * @param {Element} el             element handed to `window.tippy`
 * @param {Function} [contentFn]   becomes `content` when provided
 * @param {Function} [onTriggerFn] becomes `onTrigger` when provided
 * @param {Function} [onUntriggerFn] becomes `onUntrigger` when provided
 */
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
  const optionalHooks = [
    ['content', contentFn],
    ['onTrigger', onTriggerFn],
    ['onUntrigger', onUntriggerFn],
  ];
  const config = {
    allowHTML: true,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    // Mount the tooltip inside the parent of the element passed in.
    appendTo: (target) => target.parentElement,
    interactive: true,
    interactiveBorder: 10,
    theme: 'quarto',
    placement: 'bottom-start',
  };
  optionalHooks.forEach(([key, hook]) => {
    if (hook) {
      config[key] = hook;
    }
  });
  window.tippy(el, config);
}
// Give every footnote reference a hover tooltip showing the footnote's HTML.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (var i=0; i<noterefs.length; i++) {
  const ref = noterefs[i];
  tippyHover(ref, function() {
    // use id or data attribute instead here
    let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
    if (!href) {
      // No target recorded on the link: nothing to show.
      return "";
    }
    // Absolute URLs are reduced to their fragment; relative values such as
    // "#fn1" make `new URL` throw and are kept unchanged.
    try { href = new URL(href).hash; } catch {}
    // Drop the leading "#" (and an optional "/") to obtain the element id.
    const id = href.replace(/^#\/?/, "");
    const note = window.document.getElementById(id);
    // getElementById returns null for a dangling reference; show an empty
    // tooltip rather than throwing inside the tooltip callback.
    return note ? note.innerHTML : "";
  });
}
// Cross-reference links; collected here for use further down in this script.
const xrefs = window.document.querySelectorAll('a.quarto-xref');
const processXRef = (id, note) => {
// Strip column container classes
const stripColumnClz = (el) => {
el.classList.remove("page-full", "page-columns");
if (el.children) {
for (const child of el.children) {
stripColumnClz(child);
}
}
}
stripColumnClz(note)
if (id === null || id.startsWith('sec-')) {
// Special case sections, only their first couple elements
const container = document.createElement("div");
if (note.children && note.children.length > 2) {
container.appendChild(note.children[0].cloneNode(true));
for (let i = 1; i < note.children.length; i++) {
const child = note.children[i];
if (child.tagName === "P" && child.innerText === "") {
continue;