expected_error.py (forked from modAL-python/modAL)
"""
Expected error reduction framework for active learning.
"""
import numpy as np
from sklearn.base import clone
from sklearn.exceptions import NotFittedError
from modAL.models import ActiveLearner
from modAL.utils.data import modALinput, data_vstack, enumerate_data, drop_rows, data_shape, add_row
from modAL.utils.selection import multi_argmax, shuffled_argmax
from modAL.uncertainty import _proba_uncertainty, _proba_entropy
def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = 'binary',
                             p_subsample: float = 1.0, n_instances: int = 1,
                             random_tie_break: bool = False) -> np.ndarray:
"""
Expected error reduction query strategy.
References:
Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf)
Args:
learner: The ActiveLearner object for which the expected error
is to be estimated.
X: The samples.
loss: The loss function to be used. Can be 'binary' or 'log'.
p_subsample: Probability of keeping a sample from the pool when
calculating expected error. Significantly improves runtime
for large sample pools.
n_instances: The number of instances to be sampled.
random_tie_break: If True, shuffles utility scores to randomize the order. This
can be used to break the tie when the highest utility score is not unique.
Returns:
The indices of the instances from X chosen to be labelled.
"""
    assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'
    assert loss in ['binary', 'log'], "loss must be 'binary' or 'log'"

    expected_error = np.zeros(shape=(data_shape(X)[0],))
    possible_labels = np.unique(learner.y_training)

    try:
        X_proba = learner.predict_proba(X)
    except NotFittedError:
        # TODO: implement a proper cold-start
        return np.array([0])

    cloned_estimator = clone(learner.estimator)

    for x_idx, x in enumerate_data(X):
        # subsample the pool if needed
        if np.random.rand() <= p_subsample:
            X_reduced = drop_rows(X, x_idx)
            # estimate the expected error: for each possible label, refit the
            # model as if x were labelled that way, then weight the loss on the
            # remaining pool by the current predicted probability of that label
            for y_idx, y in enumerate(possible_labels):
                X_new = add_row(learner.X_training, x)
                y_new = data_vstack((learner.y_training, np.array(y).reshape(1,)))

                cloned_estimator.fit(X_new, y_new)
                refitted_proba = cloned_estimator.predict_proba(X_reduced)
                if loss == 'binary':
                    nloss = _proba_uncertainty(refitted_proba)
                elif loss == 'log':
                    nloss = _proba_entropy(refitted_proba)

                expected_error[x_idx] += np.sum(nloss) * X_proba[x_idx, y_idx]
        else:
            # skipped by subsampling: assign infinite error so it is never selected
            expected_error[x_idx] = np.inf

    if not random_tie_break:
        return multi_argmax(-expected_error, n_instances)

    return shuffled_argmax(-expected_error, n_instances)
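

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module). Assuming a scikit-learn
# classifier and modAL's ActiveLearner, expected_error_reduction can be
# plugged in as a query strategy, with functools.partial fixing its keyword
# arguments. The synthetic dataset, LogisticRegression estimator, and the
# loss/p_subsample settings below are illustrative assumptions only.
if __name__ == '__main__':
    from functools import partial

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=200, n_features=5, random_state=0)
    X_seed, y_seed = X[:10], y[:10]   # small labelled seed set
    X_pool = X[10:]                   # unlabelled pool to query from

    learner = ActiveLearner(
        estimator=LogisticRegression(),
        query_strategy=partial(expected_error_reduction,
                               loss='log', p_subsample=0.5),
        X_training=X_seed, y_training=y_seed,
    )

    # learner.query forwards the pool to expected_error_reduction; it should
    # return the chosen indices along with the corresponding rows of X_pool
    query_idx, query_inst = learner.query(X_pool)
    print('queried pool indices:', query_idx)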