
Commit

Merge pull request #2 from stat-ml/package
Package
kirill-fedyanin authored Mar 10, 2020
2 parents 6cfd9c0 + 0e2d1e2 commit b4770e3
Showing 22 changed files with 408 additions and 91 deletions.
15 changes: 14 additions & 1 deletion README.md
@@ -3,4 +3,17 @@

Package for active learning and uncertainty quantification in neural nets.

Example experiment script/notebooks with library usage are in `examples` folder.
## Installation

You can install the package via pip:
```
pip install alpaca-ml
```

## Code examples
Tutorials are available as example notebooks in the `examples` folder.

- [Uncertainty for classification task](examples/classification_uq.ipynb)
- [Uncertainty for regression task](examples/regression_uq.ipynb)
- [Active learning on regression](examples/active_learning.ipynb)
- [Uncertainty visualization](examples/visualization.ipynb)
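
A minimal quick-start sketch based on the modules touched in this commit (the import paths and the `(x, y)` tuple format for `fit` are assumptions, not documented API):

```python
import numpy as np

from alpaca.model.mlp import MLP
from alpaca.uncertainty_estimator.mcdue import MCDUE

# Toy regression data; float64, since BaseMLP calls self.double()
x = np.random.randn(200, 8)
y = x.sum(axis=1, keepdims=True)

model = MLP([8, 128, 64, 1])
model.fit((x[:160], y[:160]), (x[160:], y[160:]), epochs=100)  # data format assumed

# passing the mask by name ('dpp') is enabled for MCDUE by this commit
estimator = MCDUE(model, nn_runs=25, dropout_rate=0.5, dropout_mask='dpp')
```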
Empty file.
13 changes: 7 additions & 6 deletions alpaca/dataloader/boston_housing.py
@@ -36,10 +36,11 @@ def _build_dataset(self, cache_dir):
data_path = download(cache_dir, 'housing.data', URL)
self.df = pd.read_table(data_path, names=self.column_names, header=None, delim_whitespace=True)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}

if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []

self.data = {'train': train, 'val': val}
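
The same `val_split` guard recurs in every dataloader below; a standalone sketch of the pattern (hypothetical helper name) shows why the branch is needed — `train_test_split` rejects a zero `test_size`:

```python
import numpy as np
from sklearn.model_selection import train_test_split

def split_table(table, val_split):
    # Skip the split entirely when no validation fraction is requested;
    # sklearn raises ValueError for test_size=0.
    if val_split != 0:
        return train_test_split(table, test_size=val_split, shuffle=True)
    return table, []

table = np.arange(20).reshape(10, 2)
train, val = split_table(table, 0.3)  # 7/3 shuffled split
train, val = split_table(table, 0)    # all rows in train, val == []
```

Note that the `'all'` key is dropped from `self.data`, so downstream code that used the concatenated train and val arrays must now build them itself.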

12 changes: 6 additions & 6 deletions alpaca/dataloader/ccpp.py
@@ -39,12 +39,12 @@ def _build_dataset(self, cache_dir):
self.df = pd.read_excel(file_path)

table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}

if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}


if __name__ == '__main__':
11 changes: 5 additions & 6 deletions alpaca/dataloader/concrete.py
@@ -34,10 +34,9 @@ def _build_dataset(self, cache_dir):
data_path = download(cache_dir, 'concrete.xls', URL)
self.df = pd.read_excel(data_path)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}
if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}

11 changes: 5 additions & 6 deletions alpaca/dataloader/energy_efficiency.py
@@ -36,10 +36,9 @@ def _build_dataset(self, cache_dir):
data_path = download(cache_dir, 'energy_efficiencty.xlsx', URL)
self.df = pd.read_excel(data_path)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}
if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}

12 changes: 6 additions & 6 deletions alpaca/dataloader/kin8nm.py
@@ -34,12 +34,12 @@ def _build_dataset(self, cache_dir):
data_path = download(cache_dir, 'kin8nm.csv', URL)
self.df = pd.read_csv(data_path)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}

if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}


if __name__ == '__main__':
11 changes: 5 additions & 6 deletions alpaca/dataloader/naval_propulsion.py
@@ -38,12 +38,11 @@ def _build_dataset(self, cache_dir):
file_path = path.join(cache_dir, 'UCI CBM Dataset', 'data.txt')
self.df = pd.read_csv(file_path, delim_whitespace=True, header=None)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}
if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}


if __name__ == '__main__':
Expand Down
10 changes: 5 additions & 5 deletions alpaca/dataloader/protein_structure.py
@@ -33,11 +33,11 @@ def _build_dataset(self, cache_dir):
data_path = download(cache_dir, 'CASP.csv', URL)
self.df = pd.read_csv(data_path)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
}
if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}


if __name__ == '__main__':
Expand Down
11 changes: 5 additions & 6 deletions alpaca/dataloader/red_wine.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,11 @@ def _build_dataset(self, cache_dir):
data_path = download(cache_dir, 'winequality-red.csv', URL)
self.df = pd.read_csv(data_path, sep=';')
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}
if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}


if __name__ == '__main__':
Expand Down
11 changes: 5 additions & 6 deletions alpaca/dataloader/yacht_hydrodynamics.py
@@ -34,12 +34,11 @@ def _build_dataset(self, cache_dir):
data_path = download(cache_dir, 'yacht_hydrodynamics.data', URL)
self.df = pd.read_csv(data_path, delim_whitespace=True, header=None)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}
if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []
self.data = {'train': train, 'val': val}


if __name__ == '__main__':
Expand Down
12 changes: 6 additions & 6 deletions alpaca/dataloader/year_prediction_msd.py
@@ -38,12 +38,12 @@ def _build_dataset(self, cache_dir):
file_path = path.join(cache_dir, 'YearPredictionMSD.txt')
self.df = pd.read_csv(file_path, header=None)
table = self.df.to_numpy()
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
self.data = {
'train': train,
'val': val,
'all': np.concatenate((train, val))
}
if self.val_split != 0:
train, val = train_test_split(table, test_size=self.val_split, shuffle=True)
else:
train, val = table, []

self.data = {'train': train, 'val': val}


if __name__ == '__main__':
3 changes: 1 addition & 2 deletions alpaca/model/ensemble.py
@@ -12,8 +12,7 @@ def __init__(self, layers, n_models, reduction='mean', **kwargs):

def fit(self, train_set, val_set, verbose=True, **kwargs):
for i, model in enumerate(self.models):
if verbose:
self._print_fit_status(i+1, self.n_models)
self._print_fit_status(i+1, self.n_models)
model.fit(train_set, val_set, verbose=verbose, **kwargs)

def state_dict(self):
34 changes: 18 additions & 16 deletions alpaca/model/mlp.py
@@ -3,14 +3,17 @@
import torch.nn as nn
import torch.nn.functional as F

from alpaca.dataloader import loader
from alpaca.dataloader.custom_dataset import loader


class BaseMLP(nn.Module):
def __init__(self, layer_sizes, activation, postprocessing=lambda x: x):
def __init__(self, layer_sizes, activation, postprocessing=lambda x: x, device=None):
super(BaseMLP, self).__init__()

self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device is None:
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
else:
self.device = device

self.layer_sizes = layer_sizes
self.fcs = []
@@ -23,19 +26,18 @@ def __init__(self, layer_sizes, activation, postprocessing=lambda x: x):
self.double()
self.to(self.device)

def forward(self, x, dropout_rate=0, train=False, dropout_mask=None):
out = torch.DoubleTensor(x).to(self.device) if isinstance(x, np.ndarray) else x
out = self.activation(self.fcs[0](out))
def forward(self, x, dropout_rate=0, dropout_mask=None):
x = self.activation(self.fcs[0](x))

for layer_num, fc in enumerate(self.fcs[1:-1]):
out = self.activation(fc(out))
x = self.activation(fc(x))
if dropout_mask is None:
out = nn.Dropout(dropout_rate)(out)
x = nn.Dropout(dropout_rate)(x)
else:
out = out*dropout_mask(out, dropout_rate, layer_num)
out = self.fcs[-1](out)
out = self.postprocessing(out)
return out if train else out.detach()
x = x*dropout_mask(x, dropout_rate, layer_num)
x = self.fcs[-1](x)
x = self.postprocessing(x)
return x

def fit(
self, train_set, val_set, epochs=10000, verbose=True,
@@ -53,7 +55,7 @@ def fit(
labels = labels.to(self.device)

# Forward pass
outputs = self(points, train=True, dropout_rate=dropout_rate)
outputs = self(points, dropout_rate=dropout_rate)
loss = self.criterion(outputs, labels)

# Backward and optimize
@@ -96,15 +98,15 @@ def _print_status(self, epoch, epochs, loss, val_loss):


class MLP(BaseMLP):
def __init__(self, layer_sizes, l2_reg=1e-5, postprocessing=None, loss=nn.MSELoss,
optimizer=None, activation=None):
def __init__(self, layer_sizes, postprocessing=None, loss=nn.MSELoss,
optimizer=None, activation=None, **kwargs):
if postprocessing is None:
postprocessing = lambda x: x

if activation is None:
activation = F.celu

super(MLP, self).__init__(layer_sizes, activation=activation, postprocessing=postprocessing)
super(MLP, self).__init__(layer_sizes, activation=activation, postprocessing=postprocessing, **kwargs)

self.criterion = loss()
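
A short sketch of the new `device` handling (`MLP` now forwards `**kwargs` to `BaseMLP`, so the keyword can be passed at the top level; the import path and shapes are illustrative):

```python
import torch
from alpaca.model.mlp import MLP

model = MLP([8, 64, 1])                     # picks 'cuda' if available, else 'cpu'
model_cpu = MLP([8, 64, 1], device='cpu')   # pin the device explicitly

x = torch.randn(4, 8, dtype=torch.float64)  # BaseMLP calls self.double()
out = model_cpu(x)  # forward() no longer takes train=; the output keeps its graph
```

Note the behavioral change in `forward`: it used to return `out.detach()` unless `train=True`; callers that relied on detached outputs now need to call `.detach()` themselves.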

5 changes: 3 additions & 2 deletions alpaca/uncertainty_estimator/masks.py
@@ -7,7 +7,6 @@
from dppy.finite_dpps import FiniteDPP


# DEFAULT_MASKS = ['basic_bern', 'decorrelating_sc', 'dpp', 'k_dpp', 'k_dpp_noisereg']
DEFAULT_MASKS = ['mc_dropout', 'decorrelating_sc', 'dpp', 'k_dpp']


@@ -85,7 +84,7 @@ def reset(self):
self.layer_correlations = {}


ATTEMPTS = 10
ATTEMPTS = 30


class DPPMask:
@@ -101,6 +100,8 @@ def __call__(self, x, dropout_rate=0.5, layer_num=0):
x_matrix = x.cpu().numpy()

self.x_matrix = x_matrix
micro = 1e-12
x_matrix += np.random.random(x_matrix.shape) * micro # for computational stability
correlations = np.corrcoef(x_matrix.T)
self.dpps[layer_num] = FiniteDPP('likelihood', **{'L': correlations})
self.layer_correlations[layer_num] = correlations
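
A standalone illustration of the stability jitter added above: without the noise, a constant activation column gives `np.corrcoef` a zero standard deviation and the correlation matrix fills with NaNs:

```python
import numpy as np

x_matrix = np.zeros((64, 16))    # degenerate activations: corrcoef yields NaNs
micro = 1e-12
x_matrix = x_matrix + np.random.random(x_matrix.shape) * micro
correlations = np.corrcoef(x_matrix.T)
assert np.isfinite(correlations).all()  # finite kernel, usable by FiniteDPP
```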
3 changes: 3 additions & 0 deletions alpaca/uncertainty_estimator/mcdue.py
@@ -1,5 +1,6 @@
import numpy as np
import torch
from .masks import build_mask


class MCDUE:
@@ -30,6 +31,8 @@ def __init__(self, net, nn_runs=25, dropout_rate=.5, dropout_mask=None, keep_run
self.net = net
self.nn_runs = nn_runs
self.dropout_rate = dropout_rate
if isinstance(dropout_mask, str):
dropout_mask = build_mask(dropout_mask)
self.dropout_mask = dropout_mask
self.keep_runs = keep_runs
self._mcd_runs = np.array([])
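
With the change above, `MCDUE` accepts a mask by name and builds it via `build_mask`; a brief sketch (import paths assumed):

```python
from alpaca.model.mlp import MLP
from alpaca.uncertainty_estimator.mcdue import MCDUE

net = MLP([8, 64, 1])
# Any entry of DEFAULT_MASKS in masks.py should work here:
# 'mc_dropout', 'decorrelating_sc', 'dpp', 'k_dpp'
estimator = MCDUE(net, nn_runs=25, dropout_rate=0.5, dropout_mask='dpp')
```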
36 changes: 36 additions & 0 deletions examples/active_learning.ipynb
@@ -0,0 +1,36 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
