-
Notifications
You must be signed in to change notification settings - Fork 6
/
ClassificationForest.ecl
162 lines (157 loc) · 8.85 KB
/
ClassificationForest.ecl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
/*##############################################################################
## HPCC SYSTEMS software Copyright (C) 2017 HPCC Systems®. All rights reserved.
############################################################################## */
IMPORT $ AS LT;
IMPORT LT.LT_Types AS Types;
IMPORT ML_Core;
IMPORT ML_Core.Types as CTypes;
IMPORT ML_Core.interfaces AS Interfaces;
IMPORT LT.internal AS int;
NumericField := CTypes.NumericField;
DiscreteField := CTypes.DiscreteField;
Layout_Model2 := CTypes.Layout_Model2;
TreeNodeDat := Types.TreeNodeDat;
t_Discrete := CTypes.t_Discrete;
t_Work_Item := CTypes.t_Work_Item;
t_RecordID := CTypes.t_RecordId;
ClassProbs := Types.ClassProbs;
IClassify2 := Interfaces.IClassify2;
/**
* Classification using Random Forest algorithm.
* <p>This module implements Random Forest classification as described by
* Breiman, 2001 with extensions.
* (see https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf)
*
* <p>Random Forests provide a very effective method for classification
* with few assumptions about the nature of the data. They are known
* to be one of the best out-of-the-box methods as there are few assumptions
* made regarding the nature of the data or its relationship to classes.
* Random Forests can effectively manage large numbers
* of features, and will automatically choose the most relevant features.
* Random Forests inherently support multi-class problems. Any number of
* class labels can be used.
*
* <p>This implementation supports both Numeric (discrete or continuous) and
* Nominal (unordered categorical values) for the independent (X) features.
* There is therefore, no need to one-hot encode categorical features.
* Nominal features should be identified by including their feature 'number'
* in the set of 'nominalFields' in GetModel.
*
* <p>RegressionForest supports the Myriad interface meaning that multiple
* independent models can be computed with a single call (see ML_Core.Types
* for information on using the Myriad feature).
*
* <p>Notes on use of NumericField and DiscreteField layouts:
* <ul>
* <li>Work-item ids ('wi' field) are not required to be sequential, though they must be positive
* numbers. It is a good practice to assign wi = 1 when only one work-item is used.</li>
* <li>Record Ids ('id' field) are not required to be sequential, though slightly faster performance
* will result if they are sequential (i.e. 1 .. numRecords) for each work-item.</li>
* <li>Feature numbers ('number' field) are not required to be sequential, though slightly faster
* performance will result if they are (i.e. 1 .. numFeatures) for each work-item.</li>
* </ul>
*
* @param numTrees The number of trees to create in the forest for each work-item.
* This defaults to 100, which is adequate for most cases. Increasing
* this parameter generally results in less variance in accuracy between
* runs, at the expense of greater run time.
* @param featuresPerNode The number of features to choose among at each split in
* each tree. This number of features will be chosen at random
* from the full set of features. The default (0) uses the square
* root of the number of features provided, which works well
* for most cases.
* @param maxDepth The deepest to grow any tree in the forest. The default is
* 100, which is adequate for most purposes. Increasing this value
* for very large and complex problems my provide slightly greater
* accuracy at the expense of much greater runtime.
* @param nominalFields An optional set of field 'numbers' that represent Nominal (i.e. unordered,
* categorical) values. Specifying the nominal fields improves run-time
* performance on these fields and may improve accuracy as well. Binary fields
* (fields with only two values) need not be included here as they can be
* considered either ordinal or nominal. The default is to treat all fields as
* ordered.
* Note that this feature should only be used if all of the independent
* data for all work-items use the same record format, and therefore have the same
* set of nominal fields.
* @param balanceClasses An optional Boolean parameter. If true, it indicates that the voting among
* trees should be biased inversely to the frequency of the class for which it
* is voting. This may help in scenarios where there are far more samples of
* certain classes than of others. The default is to not balance (i.e. FALSE).
*/
EXPORT ClassificationForest(UNSIGNED numTrees=100,
UNSIGNED featuresPerNode=0,
UNSIGNED maxDepth=100,
SET OF UNSIGNED nominalFields=[],
BOOLEAN balanceClasses=FALSE) := MODULE(LT.LearningForest(numTrees, featuresPerNode, maxDepth), IClassify2)
/**
* Fit and return a model that maps independent data (X) to its predicted class (Y).
*
* @param independents The set of independent data in NumericField format.
* @param dependents The set of classes in DiscreteField format that correspond to the independent data
* i.e. same 'id'.
* @param nominalFields An optional set of field 'numbers' that represent Nominal (i.e. unordered,
* categorical) values. Specifying the nominal fields improves run-time
* performance on these fields and my improve accuracy as well. Binary fields
* (fields with only two values) need not be listed here as they can be
* considered either ordinal or nominal. Example: [3,5,7].
* @return Model in Layout_Model2 format describing the fitted forest.
* @see ML_Core.Types.NumericField, ML_Core.Types.DiscreteField, ML_Core.Types.Layout_Model2
*/
EXPORT DATASET(Layout_Model2) GetModel(DATASET(NumericField) independents, DATASET(DiscreteField) dependents) := FUNCTION
genX := NF2GenField(independents, nominalFields);
genY := DF2GenField(dependents);
myRF := int.RF_Classification(genX, genY, numTrees, featuresPerNode, maxDepth);
model := myRF.GetModel;
RETURN model;
END;
/**
* Classify a set of data points using a previously fitted model
*
* @param model A model previously returned by GetModel in Layout_Model2 format.
* @param observations The set of independent data to classify in NumericField format.
* @return A DiscreteField dataset that indicates the predicted class of each item
* in observations.
*/
EXPORT DATASET(DiscreteField) Classify(DATASET(Layout_Model2) model, DATASET(NumericField) observations) := FUNCTION
genX := NF2GenField(observations);
myRF := int.RF_Classification();
classes := myRF.Classify(genX, model, balanceClasses);
RETURN classes;
END;
/**
* Calculate the 'probability' that each data point is in each class.
* <p>Probability is approximated by computing the proportion of trees that
* voted for each class for each data point, so should not be treated
* as a reliable measure of true probability.
*
* @param model A model previously returned by GetModel in Layout_Model2 format.
* @param observations The set of independent data to classify in NumericField format.
* @return DATASET(ClassProbs), one record per datapoint (i.e. id) per class
* label. Class labels with zero votes are omitted.
* @see LT_Types.ClassProbs
*
*/
EXPORT DATASET(ClassProbs) GetClassProbs(DATASET(Layout_Model2) model, DATASET(NumericField) observations) := FUNCTION
genX := NF2GenField(observations);
myRF := int.RF_Classification();
probs := myRF.GetClassProbs(genX, model, balanceClasses);
probsS := SORT(probs, wi, id, class); // Global sort
RETURN probsS;
END;
/**
* Extract the set of class weights from the model.
* <p>Classes are weighted inversely proportional to their frequency in the
* training data. <p>Note that the class weights are based on a non-linear
* 'proportion' to avoid excess weight for classes with very low frequency.
* <p>These weights are only used when the 'balanceClasses' option is TRUE.
*
* @param mod A model as returned from GetModel.
* @return DATASET(ClassWeightRec) representing weight for each class label.
* @see LT_Types.ClassWeightRec
*/
EXPORT Model2ClassWeights(DATASET(Layout_Model2) mod) := FUNCTION
myRF := int.RF_Classification();
cw := myRF.Model2ClassWeights(mod);
RETURN cw;
END;
END;