-
Notifications
You must be signed in to change notification settings - Fork 6
/
LearningForest.ecl
197 lines (193 loc) · 9.55 KB
/
LearningForest.ecl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
/*##############################################################################
## HPCC SYSTEMS software Copyright (C) 2017 HPCC Systems®. All rights reserved.
############################################################################## */
IMPORT $ AS LT;
IMPORT LT.LT_Types AS Types;
IMPORT LT.internal AS int;
IMPORT ML_Core;
IMPORT ML_Core.Types AS CTypes;
// Convenience aliases for the record layouts used throughout this module.
GenField := Types.GenField;
DiscreteField := CTypes.DiscreteField;
NumericField := CTypes.NumericField;
Layout_Model2 := CTypes.Layout_Model2;
ModelStats := Types.ModelStats;
TreeNodeDat := Types.TreeNodeDat;
/**
 * This is the base module for Random Forests.
 * It implements the Random Forest algorithms as described by Breiman, 2001
 * (see https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf).
 *
 * @param numTrees The number of trees to create as the forest for each work-item.
 *                 This defaults to 100, which is adequate for most cases.
 * @param featuresPerNode The number of features to choose among at each split in
 *                        each tree. This number of features will be chosen at random
 *                        from the full set of features. The default (zero) selects
 *                        the square root of the number of features provided, which
 *                        works well for most cases.
 * @param maxDepth The deepest to grow any tree in the forest. The default is
 *                 100, which is adequate for most purposes. Increasing this value
 *                 for very large and complex problems may provide slightly greater
 *                 accuracy at the expense of much greater runtime.
 */
EXPORT LearningForest(UNSIGNED numTrees=100,
                      UNSIGNED featuresPerNode=0,
                      UNSIGNED maxDepth=100) := MODULE
  // Map a NumericField dataset to a GenField dataset.
  // Fields whose 'number' appears in nominalFields are marked non-ordinal (categorical);
  // all others are treated as ordinal.
  SHARED DATASET(GenField) NF2GenField(DATASET(NumericField) ds, SET OF UNSIGNED nominalFields=[]) := FUNCTION
    dsOut := PROJECT(ds, TRANSFORM(GenField, SELF.isOrdinal := LEFT.number NOT IN nominalFields, SELF := LEFT));
    RETURN dsOut;
  END;
  // Map a DiscreteField dataset to a GenField dataset.
  // Discrete fields are always treated as ordinal.
  SHARED DATASET(GenField) DF2GenField(DATASET(DiscreteField) ds) := FUNCTION
    dsOut := PROJECT(ds, TRANSFORM(GenField, SELF.isOrdinal := TRUE, SELF := LEFT));
    RETURN dsOut;
  END;
  /**
   * Get summary statistical information about the model.
   *
   * @param mod A model previously returned from GetModel.
   * @return A single ModelStats record per work-item, containing information about the model
   *         for that work-item.
   * @see LT_Types.ModelStats
   */
  EXPORT DATASET(ModelStats) GetModelStats(DATASET(Layout_Model2) mod) := FUNCTION
    myRF := int.RF_Base();
    RETURN myRF.GetModelStats(mod);
  END;
  /**
   * Extract the set of tree nodes from a model.
   *
   * @param mod A model as returned from GetModel.
   * @return Set of tree nodes representing the fitted forest in
   *         DATASET(TreeNodeDat) format.
   * @see LT_Types.TreeNodeDat
   */
  EXPORT DATASET(TreeNodeDat) Model2Nodes(DATASET(Layout_Model2) mod) := FUNCTION
    myRF := int.RF_Base();
    nodes0 := myRF.Model2Nodes(mod);
    // Sort locally for readability / downstream processing: one forest per wi,
    // trees in order, nodes in level-then-id order within each tree.
    nodes := SORT(nodes0, wi, treeId, level, nodeId, LOCAL);
    RETURN nodes;
  END;
  /**
   * <p>Determine the relative importance of features in the decision process of
   * the model.
   * Calculate feature importance using the Mean Decrease Impurity (MDI) method
   * from "Understanding Random Forests" by Gilles Louppe (https://arxiv.org/pdf/1407.7502.pdf)
   * and due to Breiman [2001, 2002].
   *
   * <p>Each feature is ranked by:
   * <pre> SUM for each branch node in which feature appears (across all trees):
   *        (impurity_reduction * number of nodes split) / numTrees.</pre>
   * @param mod The model to use for ranking of feature importance.
   * @return DATASET(FeatureImportanceRec), one per feature per wi.
   * @see LT_Types.FeatureImportanceRec
   */
  EXPORT FeatureImportance(DATASET(Layout_Model2) mod) := FUNCTION
    myRF := int.RF_Base();
    fi := myRF.FeatureImportance(mod);
    RETURN fi;
  END;
  // Default (empty) value for optional X2 parameters below.
  SHARED empty_data := DATASET([], NumericField);
  /**
   * <p>Calculate a matrix of distances between data points in Random Forest Decision Space (RFDS).
   * This is an experimental method and may not
   * scale to large numbers of data point combinations.
   * Two sets of data points X1 and X2 are taken as parameters. A
   * Decision Distance will be returned for every point in X1 to every
   * point in X2. Therefore, if X1 has N points and X2 has M points, an
   * N x M matrix of results will be produced. X2 may be omitted, in which
   * case, an N x N matrix will be produced with a Decision Distance for
   * every pair of points in X1.
   *
   * <p>This metric represents a distance measure in the RFDS.
   * As such, it provides a continuous measure of distance in a space that is
   * highly non-linear and discontinuous relative to the training data.
   * Distances in RFDS can be thought of as the number of binary decisions
   * that separate two points in the tree. DD, however is a normalized
   * metric 0 <= DD < 1 that incorporates the depth of the decision tree.
   * It is also averaged over all of the trees in the forest.
   * It can possibly be viewed as an approximation of the relative Hamming Distances
   * between points.
   *
   * @param mod The Random Forest model on which to base the distances.
   * @param X1 DATASET(NumericField) of "from" points.
   * @param X2 (Optional) DATASET(NumericField) of "to" points. If this
   *           parameter is omitted, the X1 will be used as both "to" and
   *           "from" points.
   * @return DATASET(NumericField) matrix where 'id' is the id of the "from"
   *         point and 'number' is the id of the "to" point.
   *         'value' contains the DD metric between "from" and "to" points.
   *         Note that if the same point is in X1 and X2, there will be
   *         redundant metrics, since DD is a symmetric measure (i.e.
   *         DD(x1, x2) = DD(x2, x1)).
   */
  EXPORT DecisionDistanceMatrix(DATASET(Layout_Model2) mod, DATASET(NumericField) X1,
                                DATASET(NumericField) X2=empty_data) := FUNCTION
    myRF := int.RF_Base();
    ddm := myRF.DecisionDistanceMatrix(mod, NF2GenField(X1), NF2GenField(X2));
    RETURN ddm;
  END;
  /**
   *
   * Uniqueness Factor is an experimental metric that determines how far a given point
   * is (in Random Forest Decision Distance) from a set of other points.
   * It may not scale to large numbers of data points.
   *
   * Uniqueness Factor looks at the Decision Distance from each point to every other
   * point in a set.
   *
   * It is similar to Decision Distance (above), but rather than providing a distance of
   * each "from" point to every "to" point, it provides the average distance of each "from"
   * point to all of the "to" points.
   *
   * Like Decision Distance, UF lies on the interval: 0 <= UF < 1.
   *
   * A high value of UF may indicate an anomalous data point, while a low value may indicate
   * "typicalness" of a data point. It may therefore have utility for anomaly detection
   * or conversely, for the identification of class prototypes (e.g. the members of a class
   * with the lowest UF). In a two-step process one could potentially compute class prototypes
   * and then look at the distance of a point from all class prototypes. This could result
   * in a way to detect anomalies with respect to e.g., known usage patterns.
   *
   * @param mod The Random Forest model on which to base the distances.
   * @param X1 DATASET(NumericField) of "from" points.
   * @param X2 (Optional) DATASET(NumericField) of "to" points. If this
   *           parameter is omitted, the X1 will be used as both "to" and
   *           "from" points.
   * @return DATASET(NumericField) matrix where 'id' is the id of the "from"
   *         point and 'value' contains the UF metric for the point.
   *         I.e. the average DD of the "from" point to all "to" points.
   *         The 'number' field is not used and is set to 1.
   */
  EXPORT UniquenessFactor(DATASET(Layout_Model2) mod, DATASET(NumericField) X1, DATASET(NumericField) X2=empty_data) := FUNCTION
    myRF := int.RF_Base();
    uf := myRF.UniquenessFactor(mod, NF2GenField(X1), NF2GenField(X2));
    RETURN uf;
  END;
  /**
   * Compress and cleanup the model
   *
   * This function is provided to reduce the size of a model by compressing out
   * branches with only one child. These branches are a result of the RF algorithm,
   * and do not affect the results of the model.
   * This is an expensive operation, which is why it is not done as a matter of
   * course. It reduces the size of the model somewhat, and therefore slightly speeds
   * up any processing that uses the model, and reduces storage size.
   * You may want to compress the model if storage is at a premium, or if the model
   * is to be used many times (so that the slight performance gain is multiplied).
   * This also makes the model somewhat more readable, and could
   * be useful when analyzing the tree or converting it to another system
   * (e.g. LUCI) for processing.
   *
   * @param mod Model as returned from GetModel in Layout_Model2 format.
   * @return The Compressed Model.
   * @see ML_Core.Types.Layout_Model2
   *
   */
  EXPORT CompressModel(DATASET(Layout_Model2) mod) := FUNCTION
    myRF := int.RF_Base();
    cMod := myRF.CompressModel(mod);
    RETURN cMod;
  END;
END;