% StatisticalLearning.bib -- BibTeX reference database.
% Textbook: James, Witten, Hastie and Tibshirani, An Introduction to Statistical Learning.
% The volume number belongs to the Springer Texts in Statistics series, so both are given.
@book{james2013introduction,
  author    = {James, Gareth and Witten, Daniela and Hastie, Trevor and Tibshirani, Robert},
  title     = {An Introduction to Statistical Learning: With Applications in {R}},
  series    = {Springer Texts in Statistics},
  volume    = {103},
  publisher = {Springer},
  year      = {2013},
  doi       = {10.1007/978-1-4614-7138-7},
}
% Online tutorial (in Spanish) on decision trees and tree ensembles, hosted at cienciadedatos.net.
% The author has a compound surname, so the name is written in "Last, First" form
% to stop BibTeX from treating only the final word as the surname.
@misc{amat2017,
  author = {Amat Rodrigo, Joaquín},
  title  = {Árboles de decisión, {Random Forest}, {Gradient Boosting} y {C5.0}},
  year   = {2017},
  url    = {https://www.cienciadedatos.net/documentos/33_arboles_de_prediccion_bagging_random_forest_boosting#Introducci%C3%B3n},
}
% Book: Boehmke and Greenwell, Hands-On Machine Learning with R (CRC Press).
% CRC Press is the publisher (a required field for book entries), and "The R Series"
% is the book series rather than part of the title. pagetotal holds the page count.
@book{Boehmke2020,
  author    = {Boehmke, Bradley and Greenwell, Brandon},
  title     = {Hands-On Machine Learning with {R}},
  series    = {The {R} Series},
  publisher = {CRC Press},
  year      = {2020},
  pagetotal = {634},
  isbn      = {9781138495685},
  url       = {https://www.routledge.com/Hands-On-Machine-Learning-with-R/Boehmke-Greenwell/p/book/9781138495685},
}
% Journal article: Breiman, Bagging predictors, Machine Learning 24(2), 1996.
% The doi field holds the bare DOI only (no resolver prefix, no page-path suffix).
@article{Breiman1996,
  author    = {Breiman, Leo},
  title     = {Bagging predictors},
  journal   = {Machine Learning},
  volume    = {24},
  number    = {2},
  pages     = {123--140},
  year      = {1996},
  publisher = {Springer Netherlands},
  issn      = {0885-6125},
  doi       = {10.1007/BF00058655},
  url       = {https://link.springer.com/article/10.1007/BF00058655},
  keywords  = {Aggregation,Averaging,Bootstrap,Combining},
  abstract  = {Bagging predictors is a method for generating multiple versions of a predictor and using these to get an aggregated predictor. The aggregation averages over the versions when predicting a numerical outcome and does a plurality vote when predicting a class. The multiple versions are formed by making bootstrap replicates of the learning set and using these as new learning sets. Tests on real and simulated data sets using classification and regression trees and subset selection in linear regression show that bagging can give substantial gains in accuracy. The vital element is the instability of the prediction method. If perturbing the learning set can cause significant changes in the predictor constructed, then bagging can improve accuracy. © 1996 Kluwer Academic Publishers.},
}
% Journal article: Chiaretti et al., gene expression profiling in adult T-ALL, Blood 103(7), 2004.
% Only the case-sensitive term in the title is brace-protected, so styles can still recase the rest.
@article{Chiaretti2004,
  author   = {Chiaretti, Sabina and Li, Xiaochun and Gentleman, Robert and Vitale, Antonella and Vignetti, Marco and Mandelli, Franco and Ritz, Jerome and Foa, Robin},
  title    = {Gene expression profile of adult {T-cell} acute lymphocytic leukemia identifies distinct subsets of patients with different response to therapy and survival},
  journal  = {Blood},
  volume   = {103},
  number   = {7},
  pages    = {2771--2778},
  year     = {2004},
  month    = apr,
  issn     = {0006-4971},
  doi      = {10.1182/blood-2003-09-3243},
  url      = {https://doi.org/10.1182/blood-2003-09-3243},
  eprint   = {https://ashpublications.org/blood/article-pdf/103/7/2771/1697422/zh800704002771.pdf},
  abstract = {Gene expression profiles were examined in 33 adult patients with T-cell acute lymphocytic leukemia (T-ALL). Nonspecific filtering criteria identified 313 genes differentially expressed in the leukemic cells. Hierarchical clustering of samples identified 2 groups that reflected the degree of T-cell differentiation but was not associated with clinical outcome. Comparison between refractory patients and those who responded to induction chemotherapy identified a single gene, interleukin 8 (IL-8), that was highly expressed in refractory T-ALL cells and a set of 30 genes that was highly expressed in leukemic cells from patients who achieved complete remission. We next identified 19 genes that were differentially expressed in T-ALL cells from patients who either had a relapse or remained in continuous complete remission. A model based on the expression of 3 of these genes was predictive of duration of remission. The 3-gene model was validated on a further set of T-ALL samples from 18 additional patients treated on the same clinical protocol. This study demonstrates that gene expression profiling can identify a limited number of genes that are predictive of response to induction therapy and remission duration in adult patients with T-ALL. (Blood. 2004;103:2771-2778)},
}
% Journal article: Efron, Bootstrap Methods: Another Look at the Jackknife, Annals of Statistics 7(1), 1979.
% The title is stored in title case with no blanket brace protection, so the style decides casing.
@article{Efron79,
  author    = {Efron, Bradley},
  title     = {Bootstrap Methods: Another Look at the Jackknife},
  journal   = {The Annals of Statistics},
  volume    = {7},
  number    = {1},
  pages     = {1--26},
  year      = {1979},
  publisher = {Institute of Mathematical Statistics},
  doi       = {10.1214/aos/1176344552},
  url       = {https://doi.org/10.1214/aos/1176344552},
  keywords  = {bootstrap, discriminant analysis, error rate estimation, jackknife, Nonlinear regression, nonparametric variance estimation, Resampling, subsample values},
}
% Book: Hastie, Tibshirani and Friedman, The Elements of Statistical Learning (Springer, 2009).
% Stored as a book entry: the record carries an ISBN, a publisher and a city but no journal.
% The publisher's city goes in the standard address field.
@book{Hastie2009,
  author    = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome},
  title     = {The Elements of Statistical Learning: Data Mining, Inference, and Prediction},
  edition   = {Second},
  series    = {Springer Series in Statistics},
  publisher = {Springer New York},
  address   = {New York, NY},
  year      = {2009},
  isbn      = {978-0-387-84857-0},
  doi       = {10.1007/978-0-387-84858-7},
  url       = {http://link.springer.com/10.1007/978-0-387-84858-7},
}
% Book: Efron and Hastie, Computer Age Statistical Inference (Cambridge University Press, 2016).
% Efron is the first author; the citation key is left unchanged so existing citations still resolve.
@book{Hastie2016,
  author    = {Efron, Bradley and Hastie, Trevor},
  title     = {Computer Age Statistical Inference: Algorithms, Evidence, and Data Science},
  publisher = {Cambridge University Press},
  year      = {2016},
}
% Book: Genuer and Poggi, Random Forests with R (Springer, 2020).
% Stored as a book entry: the record carries an ISBN, a publisher and a city but no journal.
% The publisher's city goes in the standard address field.
@book{Genuer2020,
  author    = {Genuer, Robin and Poggi, Jean-Michel},
  title     = {Random Forests with {R}},
  series    = {Use {R}!},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2020},
  isbn      = {978-3-030-56484-1},
  doi       = {10.1007/978-3-030-56485-8},
  url       = {http://link.springer.com/10.1007/978-3-030-56485-8},
}
% Journal article: Boulesteix et al., overview of random forest methodology, 2012.
% The journal name is spelled the same way as in the other WIREs entry of this file.
@article{Boulesteix2012,
  author    = {Boulesteix, Anne-Laure and Janitza, Silke and Kruppa, Jochen and König, Inke R.},
  title     = {Overview of random forest methodology and practical guidance with emphasis on computational biology and bioinformatics},
  journal   = {WIREs Data Mining and Knowledge Discovery},
  volume    = {2},
  number    = {6},
  pages     = {493--507},
  year      = {2012},
  month     = nov,
  publisher = {Wiley-Blackwell},
  issn      = {1942-4795},
  doi       = {10.1002/widm.1072},
  abstract  = {The random forest (RF) algorithm by Leo Breiman has become a standard data analysis tool in bioinformatics. It has shown excellent performance in settings where the number of variables is much larger than the number of observations, can cope with complex interaction structures as well as highly correlated variables and return measures of variable importance. This paper synthesizes 10 years of RF development with emphasis on applications to bioinformatics and computational biology. Special attention is paid to practical aspects such as the selection of parameters, available RF implementations, and important pitfalls and biases of RF and its variable importance measures (VIMs). The paper surveys recent developments of the methodology relevant to bioinformatics as well as some representative examples of RF applications in this context and possible directions for future research. © 2012 Wiley Periodicals, Inc.},
}
% Journal article: Probst, Wright and Boulesteix, hyperparameters and tuning strategies for random forest, 2019.
% The doi field holds the bare DOI; the resolver URL is left to the style or the url field.
@article{Probst2019,
  author   = {Probst, Philipp and Wright, Marvin N. and Boulesteix, Anne-Laure},
  title    = {Hyperparameters and tuning strategies for random forest},
  journal  = {WIREs Data Mining and Knowledge Discovery},
  volume   = {9},
  number   = {3},
  pages    = {e1301},
  year     = {2019},
  doi      = {10.1002/widm.1301},
  url      = {https://wires.onlinelibrary.wiley.com/doi/abs/10.1002/widm.1301},
  eprint   = {https://wires.onlinelibrary.wiley.com/doi/pdf/10.1002/widm.1301},
  keywords = {ensemble, literature review, out-of-bag, performance evaluation, ranger, sequential model-based optimization, tuning parameter},
  abstract = {The random forest (RF) algorithm has several hyperparameters that have to be set by the user, for example, the number of observations drawn randomly for each tree and whether they are drawn with or without replacement, the number of variables drawn randomly for each split, the splitting rule, the minimum number of samples that a node must contain, and the number of trees. In this paper, we first provide a literature review on the parameters' influence on the prediction performance and on variable importance measures. It is well known that in most cases RF works reasonably well with the default values of the hyperparameters specified in software packages. Nevertheless, tuning the hyperparameters can improve the performance of RF. In the second part of this paper, after a presenting brief overview of tuning strategies, we demonstrate the application of one of the most established tuning strategies, model-based optimization (MBO). To make it easier to use, we provide the tuneRanger R package that tunes RF with MBO automatically. In a benchmark study on several datasets, we compare the prediction performance and runtime of tuneRanger with other tuning implementations in R and RF with default hyperparameters. This article is categorized under: Algorithmic Development > Biological Data Mining Algorithmic Development > Statistics Algorithmic Development > Hierarchies and Trees Technologies > Machine Learning},
}
% Book: Kuhn and Johnson, Applied Predictive Modeling (Springer, 2013).
% Stored as a book entry: the record carries an ISBN and a publisher, and its former
% journal value merely repeated the title. pagetotal holds the page count.
@book{Kuhn2013,
  author    = {Kuhn, Max and Johnson, Kjell},
  title     = {Applied Predictive Modeling},
  publisher = {Springer New York},
  year      = {2013},
  month     = jan,
  pagetotal = {600},
  isbn      = {9781461468493},
  doi       = {10.1007/978-1-4614-6849-3},
  abstract  = {Applied Predictive Modeling covers the overall predictive modeling process, beginning with the crucial steps of data preprocessing, data splitting and foundations of model tuning. The text then provides intuitive explanations of numerous common and modern regression and classification techniques, always with an emphasis on illustrating and solving real data problems. The text illustrates all parts of the modeling process through many hands-on, real-life examples, and every chapter contains extensive R code for each step of the process. This multi-purpose text can be used as an introduction to predictive models and the overall modeling process, a practitioner's reference handbook, or as a text for advanced undergraduate or graduate level predictive modeling courses. To that end, each chapter contains problem sets to help solidify the covered concepts and uses data available in the book's R package. This text is intended for a broad audience as both an introduction to predictive models as well as a guide to applying them. Non-mathematical readers will appreciate the intuitive explanations of the techniques while an emphasis on problem-solving with real data across a wide variety of applications will aid practitioners who wish to extend their expertise. Readers should have knowledge of basic statistical ideas, such as correlation and linear regression analysis. While the text is biased against complex equations, a mathematical background is needed for advanced topics.},
}
% Conference paper: Schapire, The Strength of Weak Learnability (Extended Abstract), FOCS 1989.
% Record taken from dblp; the timestamp, biburl and bibsource fields are dblp bookkeeping.
@inproceedings{Schapire89,
  author    = {Robert E. Schapire},
  title     = {The Strength of Weak Learnability (Extended Abstract)},
  booktitle = {30th Annual Symposium on Foundations of Computer Science, Research Triangle Park, North Carolina, USA, 30 October - 1 November 1989},
  publisher = {{IEEE} Computer Society},
  year      = {1989},
  pages     = {28--33},
  doi       = {10.1109/SFCS.1989.63451},
  url       = {https://doi.org/10.1109/SFCS.1989.63451},
  timestamp = {Thu, 23 Mar 2023 23:57:54 +0100},
  biburl    = {https://dblp.org/rec/conf/focs/Schapire89.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}