-
Notifications
You must be signed in to change notification settings - Fork 9
/
test_outliers.py
97 lines (87 loc) · 3.54 KB
/
test_outliers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
""" Testing outlier detection
"""
import numpy as np
import outliers
# Only needed if working interactively
reload(outliers)
from nose.tools import assert_equal, assert_true
from numpy.testing import assert_almost_equal
def test_compute_mu_var():
# Test computation of maybe multivariable mean and variance
assert_equal(outliers.compute_mu_var([[1, 1, 1, 1]]), (1, 0))
assert_equal(outliers.compute_mu_var([[-1, 0, 1]]), (0, 1))
# Make a random number generator, seed it to make numbers predictable
rng = np.random.RandomState(42)
vector = rng.normal(3, 7, size=(100,))
mu, var = outliers.compute_mu_var(vector)
assert_almost_equal(mu, vector.mean())
# We used 1 df for the variance estimation
assert_almost_equal(var, vector.var(ddof=1))
# Does it also work for a 2D (row) vector?
mu, var = outliers.compute_mu_var(vector.reshape((1, 100)))
assert_almost_equal(mu, vector.mean())
assert_almost_equal(var, vector.var(ddof=1))
# A list ?
mu, var = outliers.compute_mu_var(vector.tolist())
assert_almost_equal(mu, vector.mean())
assert_almost_equal(var, vector.var(ddof=1))
# 2D matrix
arr2d = rng.normal(3, 7, size=(2, 100))
mu, var = outliers.compute_mu_var(arr2d)
assert_almost_equal(mu, arr2d.mean(axis=1))
demeaned = arr2d - mu[:, None]
est_var = np.dot(demeaned, demeaned.T) / 99
assert_almost_equal(var, est_var)
def test_mahal():
# Test mahalanobis distance
# Basic 1D check - lists as input, 2D row vector list
assert_almost_equal(
outliers.compute_mahal([[-1, 0, 1]], 1, 1), [ 4., 1., 0.])
# Arrays as input, 1D vector
assert_almost_equal(
outliers.compute_mahal(np.array([-1, 0, 1]), np.array(1), np.array(1)),
[ 4., 1., 0.])
# For some random numbers
rng = np.random.RandomState(42)
vector = rng.normal(2, 5, size=(100,))
# Need to pass in variance rather than sigma
distances = outliers.compute_mahal(vector, 2, 5 ** 2)
# Check we get back z score squared
z = (vector - 2) / 5.
assert_almost_equal(distances, z ** 2)
vector = rng.normal(-2, 8, size=(100,))
# Need to pass in variance rather than sigma
distances = outliers.compute_mahal(vector, -2, 8 ** 2)
# Check we get back z score squared for negative mean
z = (vector + 2) / 8.
assert_almost_equal(distances, z ** 2)
# Check for 2D, diagonal variance case
arr2d = np.zeros((3, 100))
# Get expected distances for 2D array with different mus, sigmas
z2s = np.zeros_like(arr2d)
mus = (1, 2, 3)
sigmas = (3, 4, 5)
for i in range(3):
mu = mus[i]
sigma = sigmas[i]
vector = rng.normal(mu, sigma, size=(100,))
z = (vector - mu) / sigma
arr2d[i] = vector
z2s[i] = z ** 2
# Test against mahal routine
distances = outliers.compute_mahal(arr2d, mus, np.diag(sigmas) ** 2)
assert_almost_equal(distances, z2s.mean(axis=0))
def test_estimate_mu_var():
# Test mean, variance estimation routine
# For some random numbers
rng = np.random.RandomState(42)
vector = rng.normal(2, 5, size=(100,))
# If the outliers are a smaller or equal number to the allowed outliers,
# we'll recover the mean for the non-outliers
vector[0:10] = 100
mu, var = outliers.estimate_mu_var(vector, 90)
assert_almost_equal(mu, np.mean(vector[10:]))
assert_almost_equal(var, np.var(vector[10:], ddof=1))
mu, var = outliers.estimate_mu_var(vector, 95)
assert_almost_equal(mu, np.mean(vector[5:]))
assert_almost_equal(var, np.var(vector[5:], ddof=1))