
Commit

Merge pull request #2 from stat-ml/package
Package
kirill-fedyanin authored Mar 10, 2020
2 parents 6cfd9c0 + 0e2d1e2 commit b4770e3
Showing 22 changed files with 408 additions and 91 deletions.
15 changes: 14 additions & 1 deletion README.md
@@ -3,4 +3,17 @@

Package for active learning and uncertainty quantification in neural nets.

Example experiment script/notebooks with library usage are in `examples` folder.
## Installation

You can install the package via pip:
```
pip install alpaca-ml
```

## Code examples
Tutorials are available as example notebooks in the `examples` folder.

- [Uncertainty for classification task](examples/classification_uq.ipynb)
- [Uncertainty for regression task](examples/regression_uq.ipynb)
- [Active learning on regression](examples/active_learning.ipynb)
- [Uncertainty visualization](examples/visualization.ipynb)
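
A minimal quick-start sketch based on the modules touched in this commit (the import paths and the `(x, y)` tuple format for `fit` are assumptions, not documented API):

```python
import numpy as np

from alpaca.model.mlp import MLP
from alpaca.uncertainty_estimator.mcdue import MCDUE

# Toy regression data; float64, since BaseMLP calls self.double()
x = np.random.randn(200, 8)
y = x.sum(axis=1, keepdims=True)

model = MLP([8, 128, 64, 1])
model.fit((x[:160], y[:160]), (x[160:], y[160:]), epochs=100)  # data format assumed

# passing the mask by name ('dpp') is enabled for MCDUE by this commit
estimator = MCDUE(model, nn_runs=25, dropout_rate=0.5, dropout_mask='dpp')
```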
Empty file.
13 changes: 7 additions & 6 deletions alpaca/dataloader/boston_housing.py
@@ -36,10 +36,11 @@ def _build_dataset(self, cache_dir):
data_path = download(cache_dir, 'housing.data', URL)
self.df = pd.read_table(data_path, names=self.column_names, header=None, delim_whitespace=True)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}

if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []

self.data = {'train': train, 'val': val}
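
The same `val_split` guard recurs in every dataloader below; a standalone sketch of the pattern (hypothetical helper name) shows why the branch is needed — `train_test_split` rejects a zero `test_size`:

```python
import numpy as np
from sklearn.model_selection import train_test_split

def split_table(table, val_split):
    # Skip the split entirely when no validation fraction is requested;
    # sklearn raises ValueError for test_size=0.
    if val_split != 0:
        return train_test_split(table, test_size=val_split, shuffle=True)
    return table, []

table = np.arange(20).reshape(10, 2)
train, val = split_table(table, 0.3)  # 7/3 shuffled split
train, val = split_table(table, 0)    # all rows in train, val == []
```

Note that the `'all'` key is dropped from `self.data`, so downstream code that used the concatenated train and val arrays must now build them itself.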

12 changes: 6 additions & 6 deletions alpaca/dataloader/ccpp.py
@@ -39,12 +39,12 @@ def _build_dataset(self, cache_dir):
self.df = pd.read_excel(file_path)

table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}

if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}


if __name__ == '__main__':
11 changes: 5 additions & 6 deletions alpaca/dataloader/concrete.py
@@ -34,10 +34,9 @@ def _build_dataset(self, cache_dir):
data_path = download(cache_dir, 'concrete.xls', URL)
self.df = pd.read_excel(data_path)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}
if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}

11 changes: 5 additions & 6 deletions alpaca/dataloader/energy_efficiency.py
@@ -36,10 +36,9 @@ def _build_dataset(self, cache_dir):
data_path = download(cache_dir, 'energy_efficiencty.xlsx', URL)
self.df = pd.read_excel(data_path)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}
if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}

12 changes: 6 additions & 6 deletions alpaca/dataloader/kin8nm.py
@@ -34,12 +34,12 @@ def _build_dataset(self, cache_dir):
data_path = download(cache_dir, 'kin8nm.csv', URL)
self.df = pd.read_csv(data_path)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}

if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}


if __name__ == '__main__':
11 changes: 5 additions & 6 deletions alpaca/dataloader/naval_propulsion.py
@@ -38,12 +38,11 @@ def _build_dataset(self, cache_dir):
file_path = path.join(cache_dir, 'UCI CBM Dataset', 'data.txt')
self.df = pd.read_csv(file_path, delim_whitespace=True, header=None)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}
if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}


if __name__ == '__main__':
Expand Down
10 changes: 5 additions & 5 deletions alpaca/dataloader/protein_structure.py
@@ -33,11 +33,11 @@ def _build_dataset(self, cache_dir):
data_path = download(cache_dir, 'CASP.csv', URL)
self.df = pd.read_csv(data_path)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
}
if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}


if __name__ == '__main__':
Expand Down
11 changes: 5 additions & 6 deletions alpaca/dataloader/red_wine.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,11 @@ def _build_dataset(self, cache_dir):
data_path = download(cache_dir, 'winequality-red.csv', URL)
self.df = pd.read_csv(data_path, sep=';')
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}
if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}


if __name__ == '__main__':
Expand Down
11 changes: 5 additions & 6 deletions alpaca/dataloader/yacht_hydrodynamics.py
@@ -34,12 +34,11 @@ def _build_dataset(self, cache_dir):
data_path = download(cache_dir, 'yacht_hydrodynamics.data', URL)
self.df = pd.read_csv(data_path, delim_whitespace=True, header=None)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}
if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}


if __name__ == '__main__':
Expand Down
12 changes: 6 additions & 6 deletions alpaca/dataloader/year_prediction_msd.py
@@ -38,12 +38,12 @@ def _build_dataset(self, cache_dir):
file_path = path.join(cache_dir, 'YearPredictionMSD.txt')
self.df = pd.read_csv(file_path, header=None)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}
if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []

self.data = {'train': train, 'val': val}


if __name__ == '__main__':
3 changes: 1 addition & 2 deletions alpaca/model/ensemble.py
@@ -12,8 +12,7 @@ def __init__(self, layers, n_models, reduction='mean', **kwargs):

def fit(self, train_set, val_set, verbose=True, **kwargs):
for i, model in enumerate(self.models):
if verbose:
self._print_fit_status(i+1, self.n_models)
self._print_fit_status(i+1, self.n_models)
model.fit(train_set, val_set, verbose=verbose, **kwargs)

def state_dict(self):
34 changes: 18 additions & 16 deletions alpaca/model/mlp.py
@@ -3,14 +3,17 @@
import torch.nn as nn
import torch.nn.functional as F

from alpaca.dataloader import loader
from alpaca.dataloader.custom_dataset import loader


class BaseMLP(nn.Module):
def __init__(self, layer_sizes, activation, postprocessing=lambda x: x):
def __init__(self, layer_sizes, activation, postprocessing=lambda x: x, device=None):
super(BaseMLP, self).__init__()

self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device is None:
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
else:
self.device = device

self.layer_sizes = layer_sizes
self.fcs = []
@@ -23,19 +26,18 @@ def __init__(self, layer_sizes, activation, postprocessing=lambda x: x):
self.double()
self.to(self.device)

def forward(self, x, dropout_rate=0, train=False, dropout_mask=None):
out = torch.DoubleTensor(x).to(self.device) if isinstance(x, np.ndarray) else x
out = self.activation(self.fcs[0](out))
def forward(self, x, dropout_rate=0, dropout_mask=None):
x = self.activation(self.fcs[0](x))

for layer_num, fc in enumerate(self.fcs[1:-1]):
out = self.activation(fc(out))
x = self.activation(fc(x))
if dropout_mask is None:
out = nn.Dropout(dropout_rate)(out)
x = nn.Dropout(dropout_rate)(x)
else:
out = out*dropout_mask(out, dropout_rate, layer_num)
out = self.fcs[-1](out)
out = self.postprocessing(out)
return out if train else out.detach()
x = x*dropout_mask(x, dropout_rate, layer_num)
x = self.fcs[-1](x)
x = self.postprocessing(x)
return x

def fit(
self, train_set, val_set, epochs=10000, verbose=True,
@@ -53,7 +55,7 @@ def fit(
labels = labels.to(self.device)

# Forward pass
outputs = self(points, train=True, dropout_rate=dropout_rate)
outputs = self(points, dropout_rate=dropout_rate)
loss = self.criterion(outputs, labels)

# Backward and optimize
@@ -96,15 +98,15 @@ def _print_status(self, epoch, epochs, loss, val_loss):


class MLP(BaseMLP):
def __init__(self, layer_sizes, l2_reg=1e-5, postprocessing=None, loss=nn.MSELoss,
optimizer=None, activation=None):
def __init__(self, layer_sizes, postprocessing=None, loss=nn.MSELoss,
optimizer=None, activation=None, **kwargs):
if postprocessing is None:
postprocessing = lambda x: x

if activation is None:
activation = F.celu

super(MLP, self).__init__(layer_sizes, activation=activation, postprocessing=postprocessing)
super(MLP, self).__init__(layer_sizes, activation=activation, postprocessing=postprocessing, **kwargs)

self.criterion = loss()
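
A short sketch of the new `device` handling (`MLP` now forwards `**kwargs` to `BaseMLP`, so the keyword can be passed at the top level; the import path and shapes are illustrative):

```python
import torch
from alpaca.model.mlp import MLP

model = MLP([8, 64, 1])                     # picks 'cuda' if available, else 'cpu'
model_cpu = MLP([8, 64, 1], device='cpu')   # pin the device explicitly

x = torch.randn(4, 8, dtype=torch.float64)  # BaseMLP calls self.double()
out = model_cpu(x)  # forward() no longer takes train=; the output keeps its graph
```

Note the behavioral change in `forward`: it used to return `out.detach()` unless `train=True`; callers that relied on detached outputs now need to call `.detach()` themselves.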

5 changes: 3 additions & 2 deletions alpaca/uncertainty_estimator/masks.py
@@ -7,7 +7,6 @@
from dppy.finite_dpps import FiniteDPP


# DEFAULT_MASKS = ['basic_bern', 'decorrelating_sc', 'dpp', 'k_dpp', 'k_dpp_noisereg']
DEFAULT_MASKS = ['mc_dropout', 'decorrelating_sc', 'dpp', 'k_dpp']


@@ -85,7 +84,7 @@ def reset(self):
self.layer_correlations = {}


ATTEMPTS = 10
ATTEMPTS = 30


class DPPMask:
@@ -101,6 +100,8 @@ def __call__(self, x, dropout_rate=0.5, layer_num=0):
x_matrix = x.cpu().numpy()

self.x_matrix = x_matrix
micro = 1e-12
x_matrix += np.random.random(x_matrix.shape) * micro # for computational stability
correlations = np.corrcoef(x_matrix.T)
self.dpps[layer_num] = FiniteDPP('likelihood', **{'L': correlations})
self.layer_correlations[layer_num] = correlations
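
A standalone illustration of the stability jitter added above: without the noise, a constant activation column gives `np.corrcoef` a zero standard deviation and the correlation matrix fills with NaNs:

```python
import numpy as np

x_matrix = np.zeros((64, 16))    # degenerate activations: corrcoef yields NaNs
micro = 1e-12
x_matrix = x_matrix + np.random.random(x_matrix.shape) * micro
correlations = np.corrcoef(x_matrix.T)
assert np.isfinite(correlations).all()  # finite kernel, usable by FiniteDPP
```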
3 changes: 3 additions & 0 deletions alpaca/uncertainty_estimator/mcdue.py
@@ -1,5 +1,6 @@
import numpy as np
import torch
from .masks import build_mask


class MCDUE:
@@ -30,6 +31,8 @@ def __init__(self, net, nn_runs=25, dropout_rate=.5, dropout_mask=None, keep_run
self.net = net
self.nn_runs = nn_runs
self.dropout_rate = dropout_rate
if isinstance(dropout_mask, str):
dropout_mask = build_mask(dropout_mask)
self.dropout_mask = dropout_mask
self.keep_runs = keep_runs
self._mcd_runs = np.array([])
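
With the change above, `MCDUE` accepts a mask by name and builds it via `build_mask`; a brief sketch (import paths assumed):

```python
from alpaca.model.mlp import MLP
from alpaca.uncertainty_estimator.mcdue import MCDUE

net = MLP([8, 64, 1])
# Any entry of DEFAULT_MASKS in masks.py should work here:
# 'mc_dropout', 'decorrelating_sc', 'dpp', 'k_dpp'
estimator = MCDUE(net, nn_runs=25, dropout_rate=0.5, dropout_mask='dpp')
```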
36 changes: 36 additions & 0 deletions examples/active_learning.ipynb
@@ -0,0 +1,36 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
