-
Notifications
You must be signed in to change notification settings - Fork 0
/
zombies.py
119 lines (94 loc) · 3.31 KB
/
zombies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from warnings import warn
import joblib
import numpy as np
import pandas as pd
from scipy import stats
from tqdm import tqdm
def confidence_interval(
statfunc, data, *,
n_boot=1000, seed=None, confidence=0.95, pool=1, silent=False,
):
stat = statfunc(data)
bootstats = bootstrap_statistics(
n_boot, seed, pool=pool, silent=silent
)(statfunc, data)
jackstats = jackknife_statistics(
pool=pool, silent=silent,
)(statfunc, data)
cis = np.column_stack([
bca_ci(confidence)(s, b, j)
for s, b, j in zip(np.atleast_1d(stat), bootstats.T, jackstats.T)
])
return same_type_as(stat)(cis)
def bca_ci(confidence):
def _(stat, bootstats, jackstats):
if np.var(jackstats) == 0:
warn("A stat had no variance", category=RuntimeWarning)
return np.array([bootstats[0]] * 2)
quantiles = bca_quantiles(confidence)(
bias=bias(stat, bootstats),
acceleration=acceleration(jackstats),
)
return np.quantile(bootstats, quantiles)
return _
def bootstrap_statistics(n_boot, seed, pool=1, silent=False):
def _(statfunc, data):
seeds_ = seeds(n_boot, seed)
stats = joblib.Parallel(n_jobs=pool)(
joblib.delayed(bootstrap_statistic(statfunc, data))(seed)
for seed in tqdm(seeds_, desc="Bootstrapping", disable=silent)
)
return np.vstack(stats)
return _
def bootstrap_statistic(statfunc, data):
def _(seed):
rng = np.random.RandomState(seed)
bootidx = rng.randint(len(data), size=len(data))
boot_data = indexable(data)[bootidx]
return statfunc(boot_data)
return _
def jackknife_statistics(pool=1, silent=False):
def _(statfunc, data):
lo_idxs = range(len(data))
stats = joblib.Parallel(n_jobs=pool)(
joblib.delayed(jackknife_statistic(statfunc, data))(lo_idx)
for lo_idx in tqdm(lo_idxs, desc="Jackknifing", disable=silent)
)
return np.vstack(stats)
return _
def jackknife_statistic(statfunc, data):
def _(lo_idx):
jack_idx = np.r_[0:lo_idx, (lo_idx + 1):len(data)]
jack_data = indexable(data)[jack_idx]
return statfunc(jack_data)
return _
def bca_quantiles(confidence):
def _(bias, acceleration):
alpha = 1 - confidence
quantiles_orig = np.array([alpha / 2, 1 - alpha])
z_orig = stats.norm.ppf(quantiles_orig)
numer = z_orig[:, None] + bias
denom = 1 - acceleration * numer
z_new = bias + (numer / denom)
return stats.norm.cdf(z_new)
return _
def bias(stat, bootstats):
return stats.norm.ppf(np.mean(bootstats < stat, axis=0))
def acceleration(jackstats):
deviations = np.mean(jackstats) - jackstats
numer = np.sum(deviations) ** 3
denom = 6 * np.sum(deviations ** 2) ** 1.5
return numer / denom
def seeds(n_boot, seed):
rng = np.random.RandomState(seed)
return rng.randint(np.iinfo(np.int32).max, size=n_boot)
def indexable(data):
if isinstance(data, (pd.DataFrame, pd.Series)):
return data.iloc
return data
def same_type_as(template):
def _(x):
if isinstance(template, pd.Series):
return pd.DataFrame(x, columns=template.index)
return x
return _