Skip to content

Commit

Permalink
Added problems 6.14-6.15
Browse files Browse the repository at this point in the history
  • Loading branch information
niuers committed May 31, 2020
1 parent 6c3c847 commit 35684e8
Show file tree
Hide file tree
Showing 6 changed files with 557 additions and 33 deletions.
454 changes: 441 additions & 13 deletions Solutions to Chapter 6 Exercises and Problems.ipynb

Large diffs are not rendered by default.

Binary file added data/usps.h5
Binary file not shown.
Binary file added data/usps.h5.zip
Binary file not shown.
77 changes: 77 additions & 0 deletions libs/data_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
import functools
import h5py
from sklearn.model_selection import StratifiedShuffleSplit


def generate_random_numbers01(N, dim, max_v = 10000):
Expand Down Expand Up @@ -242,3 +244,78 @@ def generate_gmm(means, covs, probs, N):
gs = np.random.multivariate_normal(mean, cov, count)
gaussians[ix] = gs
return gaussians


# USPS Zip Code Data for Handwritten Recognition
def load_zip_data(zip_data_path):
    """Read the USPS zip code digit data set from an HDF5 file.

    Data source: https://www.kaggle.com/bistaumanga/usps-dataset/data

    zip_data_path: path of the HDF5 file; it is opened read-only. Only the
        'train' and 'test' groups are read, and within each of them only
        the 'data' and 'target' datasets.
    Return
    X_tr, y_tr, X_te, y_te: train data, train target, test data, test target.
        Each one is read out with [:] while the file is still open, so the
        results remain usable after the file has been closed.
    """
    with h5py.File(zip_data_path, 'r') as hf:
        train_grp = hf.get('train')
        X_tr, y_tr = train_grp.get('data')[:], train_grp.get('target')[:]
        test_grp = hf.get('test')
        X_te, y_te = test_grp.get('data')[:], test_grp.get('target')[:]
    return X_tr, y_tr, X_te, y_te

def sample_zip_data(X, y, train_size, splits, random_state=0):
    """Draw stratified random train/test splits of (X, y).

    X: indexable by an array of row indices, one row per sample
    y: labels, indexable the same way as X; used for the stratification
    train_size: forwarded to StratifiedShuffleSplit(train_size=...)
    splits: number of splits to generate
    random_state: seed forwarded to StratifiedShuffleSplit; the default 0
        reproduces the previously hard-coded value
    Return
    A list with one entry per split, each entry being the list
        [X_train, y_train, X_test, y_test]
    """
    sss = StratifiedShuffleSplit(
        n_splits=splits, train_size=train_size, random_state=random_state
    )
    # The splitter only yields index arrays; materialise the actual subsets.
    return [
        [X[train_index], y[train_index], X[test_index], y[test_index]]
        for train_index, test_index in sss.split(X, y)
    ]

def calc_image_symmetry(X, img_w, img_h):
    """Score how symmetric each image is (0 is perfectly symmetric).

    Asymmetry is the mean absolute difference between an image and its
    mirrored copy; symmetry is the negation of that asymmetry, so values
    are always <= 0. Each row of X is viewed as an (img_w, img_h) array
    and only its last axis is reversed to build the mirrored copy.

    X: Nxd, where N is the number of images and d the number of pixels
    img_w, img_h: image width and height, e.g. 16x16; d must equal
        img_w x img_h
    Return
        length-N array with one symmetry value per image
    Raise
        ValueError when d != img_w x img_h
    """
    N, d = X.shape
    if img_w * img_h != d:
        raise ValueError("Image width and height don't agree with data.")

    # Mirror every image along its last axis, then flatten back to Nxd so it
    # can be compared pixel by pixel with the original rows.
    mirrored = X.reshape(N, img_w, img_h)[:, :, ::-1].reshape(N, d)
    return -np.mean(np.abs(X - mirrored), axis=1)

def calc_image_intensity(X):
    """Return the mean pixel value of every image.

    X: Nxd, where N is the number of images and d the number of pixels
    Return
        one average intensity per row of X, i.e. a length-N result
        (the mean is taken along axis 1)
    """
    return np.mean(X, axis=1)


def compute_features(X_train, X_test, img_w=16, img_h=16):
    """Build the two-column (intensity, symmetry) feature matrices.

    X_train, X_test: Nxd image matrices, one flattened image per row
    img_w, img_h: image width and height passed to calc_image_symmetry;
        the defaults keep the previously hard-coded 16x16 size
    Return
    X_tr, X_te: Nx2 matrices for the train and test images; column 0 is the
        average intensity and column 1 is the symmetry of each image
    """

    def _features(X):
        # Same column order for both sets: intensity first, symmetry second.
        symmetry = calc_image_symmetry(X, img_w, img_h)
        intensity = calc_image_intensity(X)
        return np.hstack([intensity.reshape(-1, 1), symmetry.reshape(-1, 1)])

    return _features(X_train), _features(X_test)



54 changes: 35 additions & 19 deletions libs/nn.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,11 @@ def __init__(self, X, y, k, problem_type='classification'):
self.k = k #number of nearest neighbors
self.problem_type = problem_type

def find_nn_idx(self, x):
# Find the indexes of nearest neighbors for x
def find_nn_idx(self, x, k):
# Find the indexes of k nearest neighbors for x
distances = dist(x, self.X).ravel()
order = np.argsort(np.array(distances))
return order[:self.k]
return order[:k]

def find_nn(self, x):
# Find the nearest neighbors for x
Expand Down Expand Up @@ -120,18 +120,19 @@ def init_cnn(self, X):
S_idx = np.random.choice(N, self.k)
return S_idx

def find_inconsistency(self, X, y, cnn):
def find_inconsistency(self, X, y, cnn, onn):
#Is the condensed set training data consistent?
found = False

found = False
for ix, x1 in enumerate(X): # It can be a point in S as well
x1 = x1.reshape(1, -1)
y1 = cnn.predict_one(x1) # O(K)
if y1 != y[ix]:
yo = onn.predict_one(x1)
if y1 != yo:
found = True
#print('Found diff:', ix, x1, y1, yo)
break
inconsistent_idx = ix if found else None
return inconsistent_idx
return inconsistent_idx, x1, yo

def setup_cnn(self, X, y, S_idx):
# Build a NearestNeighbors classifier based on
Expand All @@ -142,15 +143,14 @@ def setup_cnn(self, X, y, S_idx):
cnn = NearestNeighbors(S, ys, self.k)
return cnn

def augment_S(self, X, y, inconsistent_idx, S_idx):
N, d = X.shape
def augment_S(self, X, y, inconsistent_y, neighbors_idx, S_idx):
# The purpose is to find a point different from
# and nearest to inconsistent_idx
nn = NearestNeighbors(X, y, N)
inconsistent_x = X[inconsistent_idx, :].reshape(-1, d)
inconsistent_y = y[inconsistent_idx]
# Find the neighbors from nearest to farthest
neighbors_idx = nn.find_nn_idx(inconsistent_x)

#inconsistent_x = X[inconsistent_idx, :].reshape(-1, d)
# inconsistent_y = y[inconsistent_idx] #This is wrong, should be the y predicted by onn
#inconsistent_y = onn.predict_one(inconsistent_x)

found = False
for ix in neighbors_idx:
if ix in S_idx: #Find x' not in S already
Expand All @@ -159,18 +159,34 @@ def augment_S(self, X, y, inconsistent_idx, S_idx):
found = True
break
if found:
#print('Found a new idx: ', ix)
S_idx = np.append(S_idx, ix)
else:
print("Can't find a new idx.")
return S_idx

def find_cnn(self, X, y):

S_idx = self.init_cnn(X)
N, _ = X.shape
S_idx = self.init_cnn(X)
onn = NearestNeighbors(X, y, self.k)
while True:
old_s = len(S_idx)
#print('Size of S_idx: ', old_s)
cnn = self.setup_cnn(X, y, S_idx)
inconsistent_idx = self.find_inconsistency(X, y, cnn)
inconsistent_idx, inconsistent_x, inconsistent_y = self.find_inconsistency(X, y, cnn, onn)
#print('inconsistent idx: ', inconsistent_idx)
if inconsistent_idx is None:
break
S_idx = self.augment_S(X, y, inconsistent_idx, S_idx)
# Find the neighbors from nearest to farthest
neighbors_idx = onn.find_nn_idx(inconsistent_x, N)
#print('Input inconsistency: ', inconsistent_idx, inconsistent_y)
#print('NUmber of neighbors: ', len(neighbors_idx), neighbors_idx[:10])

S_idx = self.augment_S(X, y, inconsistent_y, neighbors_idx, S_idx)
if len(S_idx) == old_s:
print('No new point added into S. Exit.')
break
#print('Final S_idx: ', S_idx)
S = X[S_idx, :]
Sy = y[S_idx]
return S_idx, S, Sy
Expand Down
5 changes: 4 additions & 1 deletion libs/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,4 +111,7 @@ def plot_decision_boundaries(xx1, xx2, num_cats, classifier, transformer = None,
y = y.reshape(xx1.shape)
plt.contourf(xx1, xx2, y, alpha=alpha, cmap=cmap)
plt.xlim(xx1.min(), xx1.max())
plt.ylim(xx2.min(), xx2.max())
plt.ylim(xx2.min(), xx2.max())



0 comments on commit 35684e8

Please sign in to comment.