From c97f35d6e999694d8c5db7a2b80140a961f394c9 Mon Sep 17 00:00:00 2001 From: Neil Zhang Date: Fri, 10 Jul 2020 10:38:32 -0400 Subject: [PATCH] Added problems 9.1 --- ...o Chapter 6 Similarity-Based Methods.ipynb | 33 +--- Solutions to Chapter 9 Learning Aides.ipynb | 182 +++++++++++++++++- libs/data_util.py | 55 ++++++ libs/nn.py | 7 +- 4 files changed, 244 insertions(+), 33 deletions(-) diff --git a/Solutions to Chapter 6 Similarity-Based Methods.ipynb b/Solutions to Chapter 6 Similarity-Based Methods.ipynb index 2684080..0f9ff11 100644 --- a/Solutions to Chapter 6 Similarity-Based Methods.ipynb +++ b/Solutions to Chapter 6 Similarity-Based Methods.ipynb @@ -1377,31 +1377,6 @@ "#### Problem 6.14 (a) Prepare Zip Code Data" ] }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def split_zip_data(zip_data_path, splits = 1, train_size = 500):\n", - " # Split the raw data into train and test\n", - " # splits: specify the number of random splits for each train-test pair\n", - " X_tr, y_tr, X_te, y_te = data.load_zip_data(zip_data_path)\n", - " train_size = train_size\n", - " splits = splits\n", - " data_splits = data.sample_zip_data(X_tr, y_tr, train_size, splits)\n", - " return data_splits\n", - "\n", - "def set_two_classes(y_train, y_test, digit): \n", - " # Classify digit '1' vs. 
not '1'\n", - " y_train[y_train==digit] = 1\n", - " y_test[y_test==digit] = 1\n", - " \n", - " y_train[y_train!=digit] = -1\n", - " y_test[y_test!=digit] = -1\n", - " return y_train, y_test" - ] - }, { "cell_type": "code", "execution_count": 3, @@ -1419,7 +1394,7 @@ ], "source": [ "zip_data_path = './data/usps.h5'\n", - "data_splits = split_zip_data(zip_data_path, splits = 1)\n", + "data_splits = data.split_zip_data(zip_data_path, splits = 1)\n", "\n", "X_train, y_train, X_test, y_test = data_splits[0]\n", "\n", @@ -1428,7 +1403,7 @@ "freqs = counts/len(y_train)\n", "print('Frequencies of the digits: \\n', dict(zip(unique, freqs)))\n", "\n", - "y_train, y_test = set_two_classes(y_train, y_test, 1)" + "y_train, y_test = data.set_two_classes(y_train, y_test, 1)" ] }, { @@ -1692,7 +1667,7 @@ "k=3\n", "tot_exps = 1000 \n", "zip_data_path = './data/usps.h5'\n", - "data_splits = split_zip_data(zip_data_path, splits = tot_exps)\n", + "data_splits = data.split_zip_data(zip_data_path, splits = tot_exps)\n", "digit = 1 #we classify digit '1' vs. 
non '1'\n", "nn_Eins, nn_Eouts = [], []\n", "cnn_Eins, cnn_Eouts = [], []\n", @@ -1700,7 +1675,7 @@ " if (it + 100) % 100 == 0:\n", " print('---- Working on iteration: ', it)\n", " X_train, y_train, X_test, y_test = data_splits[it]\n", - " y_train, y_test = set_two_classes(y_train, y_test, digit)\n", + " y_train, y_test = data.set_two_classes(y_train, y_test, digit)\n", " X_tr, X_te = data.compute_features(X_train, X_test)\n", " \n", " nn_cls = nn.NearestNeighbors(X_tr, y_train, k)\n", diff --git a/Solutions to Chapter 9 Learning Aides.ipynb b/Solutions to Chapter 9 Learning Aides.ipynb index a32e22e..d0c22ac 100644 --- a/Solutions to Chapter 9 Learning Aides.ipynb +++ b/Solutions to Chapter 9 Learning Aides.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -365,7 +365,143 @@ "\n", "#### Exercise 9.18 TODO\n", "\n", - "#### Problem 9.1 TODO" + "#### Problem 9.1" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#### Problem 9.1\n", + "\n", + "df = pd.DataFrame({'x1':[0, 0, 5], 'x2':[0, 1, 5], 'y':[1, 1, -1]})\n", + "xsp1 = df.loc[df['y']==1]['x1'].values\n", + "ysp1 = df.loc[df['y']==1]['x2'].values\n", + "xsm1 = df.loc[df['y']==-1]['x1'].values\n", + "ysm1 = df.loc[df['y']==-1]['x2'].values\n", + "\n", + "#plt.tight_layout()\n", + "X_train = df[['x1', 'x2']].values\n", + "y_train = df['y'].values\n", + "cls = nn.NearestNeighbors(X_train, y_train, 1)\n", + "x1_min, x1_max = -1, 6\n", + "x2_min, x2_max = -1, 6\n", + "xx1, xx2 = myplot.get_grid(x1_min, x1_max, x2_min, x2_max, step=0.02)\n", + "myplot.plot_decision_boundaries(xx1, xx2, 2, cls)\n", + "\n", + "myplot.plt_plot([xsp1, xsm1], [ysp1, ysm1], 'scatter', \n", + " colors = ['r', 'b'], markers = ['o', '+'], labels = ['+1', '-1'], \n", + " title = \"Problem 9.1 (a) 1-Nearest Neighbor\", yscale = None, ylb = -1, yub = 6,\n", + " xlb = -1, xub = 6, xlabel = None, ylabel = None,\n", + " legends = ['+1', '-1'], legendx = None, legendy = None, marker_sizes=[25, 25])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transformed data points: [[ 0.18441744 -1.40213773]\n", + " [-1.30649561 0.54135868]\n", + " [ 1.12207817 0.86077905]]\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#### Problem 9.1 (b)\n", + "Z_train = data.input_whitening(X_train)\n", + "print(f\"Transformed data points: {Z_train}\")\n", + "cls = nn.NearestNeighbors(Z_train, y_train, 1, 'classification')\n", + "x1_min, x1_max = -1, 6\n", + "x2_min, x2_max = -1, 6\n", + "xx1, xx2 = myplot.get_grid(x1_min, x1_max, x2_min, x2_max, step=0.02)\n", + "myplot.plot_decision_boundaries(xx1, xx2, 2, cls, data.input_whitening)\n", + "\n", + "myplot.plt_plot([xsp1, xsm1], [ysp1, ysm1], 'scatter', \n", + " colors = ['r', 'b'], markers = ['o', '+'], labels = ['+1', '-1'], \n", + " title = \"Problem 9.1 (b) Whitening + 1-Nearest Neighbor\", yscale = None, ylb = -1, yub = 6,\n", + " xlb = -1, xub = 6, xlabel = None, ylabel = None,\n", + " legends = ['+1', '-1'], legendx = None, legendy = None, marker_sizes=[25, 25])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transformed data points: [[0. ]\n", + " [0.67507785]\n", + " [7.06412174]]\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#### Problem 9.1 (c)\n", + "\n", + "def pca_transformer(X):\n", + " Z, _, _ = data.pca(X, 1)\n", + " return Z\n", + "\n", + "Z_train = pca_transformer(X_train)\n", + "print(f\"Transformed data points: {Z_train}\")\n", + "cls = nn.NearestNeighbors(Z_train, y_train, 1, 'classification')\n", + "x1_min, x1_max = -1, 6\n", + "x2_min, x2_max = -1, 6\n", + "xx1, xx2 = myplot.get_grid(x1_min, x1_max, x2_min, x2_max, step=0.1)\n", + "myplot.plot_decision_boundaries(xx1, xx2, 2, cls, pca_transformer)\n", + "\n", + "myplot.plt_plot([xsp1, xsm1], [ysp1, ysm1], 'scatter', \n", + " colors = ['r', 'b'], markers = ['o', '+'], labels = ['+1', '-1'], \n", + " title = \"Problem 9.1 (c) Top 1 PCA + 1-Nearest Neighbor\", yscale = None, ylb = -1, yub = 6,\n", + " xlb = -1, xub = 6, xlabel = None, ylabel = None,\n", + " legends = ['+1', '-1'], legendx = None, legendy = None, marker_sizes=[25, 25])" ] }, { @@ -465,6 +601,48 @@ "#### Problem 9.6 TODO" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Add lib input sys.path\n", + "import os\n", + "import sys\n", + "import time\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import sklearn\n", + "import matplotlib.pyplot as plt\n", + "from scipy.optimize import minimize\n", + "import math\n", + "from sklearn.preprocessing import normalize\n", + "from functools import partial\n", + "import h5py\n", + "from scipy.spatial import distance\n", + "\n", + "nb_dir = os.path.split(os.getcwd())[0]\n", + "if nb_dir not in sys.path:\n", + " sys.path.append(nb_dir)\n", + "\n", + "from matplotlib.colors import ListedColormap\n", + "import libs.linear_models as lm\n", + "import libs.data_util as data\n", + "import libs.nn as nn\n", + "import libs.plot as myplot\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
import numpy as np
from scipy.linalg import sqrtm


# Deal with ZIP code data
def split_zip_data(zip_data_path, splits=1, train_size=500):
    """Load the ZIP-code data and draw random train/test splits from it.

    Only the training portion returned by ``load_zip_data`` is re-sampled;
    the test portion of the file is ignored, as in the original code.

    Args:
        zip_data_path: path handed to ``load_zip_data``.
        splits: number of random train/test splits to draw.
        train_size: number of training points in each split.

    Returns:
        Whatever ``sample_zip_data`` returns: a list with one
        ``[X_train, y_train, X_test, y_test]`` entry per split.
    """
    X_tr, y_tr, _, _ = load_zip_data(zip_data_path)
    return sample_zip_data(X_tr, y_tr, train_size, splits)


def set_two_classes(y_train, y_test, digit):
    """Relabel multi-class targets as ``digit`` (+1) versus the rest (-1).

    The arrays are modified IN PLACE (and also returned) so that existing
    callers that rely on the mutation keep working.

    Args:
        y_train: array of training labels.
        y_test: array of test labels.
        digit: the label that becomes +1; every other label becomes -1.

    Returns:
        The same ``(y_train, y_test)`` objects, now holding only +1 / -1.
    """
    # NOTE(review): assumes a signed dtype; -1 cannot be stored in an
    # unsigned label array -- confirm against load_zip_data.
    for labels in (y_train, y_test):
        # Evaluate the mask BEFORE writing anything. The previous version
        # wrote 1 first and then tested `!= digit`, which turned every label
        # into -1 whenever digit != 1 and also confused pre-existing 1 labels
        # with the positives.
        is_digit = labels == digit
        labels[is_digit] = 1
        labels[~is_digit] = -1
    return y_train, y_test


# Input Centering
def input_centering(X):
    """Return a copy of ``X`` (N x d) whose columns all have zero mean."""
    _, _ = X.shape  # insist on a 2-D matrix, as the original did
    return X - np.mean(X, axis=0, keepdims=True)


def input_whitening(X):
    """Center ``X`` and whiten it so its sample covariance is the identity.

    Computes ``Z = Xc @ inv(sqrtm(Sigma))`` with ``Sigma = Xc.T @ Xc / N``,
    where ``Xc`` is the centered data.

    Args:
        X: N x d data matrix, one point per row.

    Returns:
        N x d matrix ``Z`` with zero column means and ``Z.T @ Z / N == I``.

    Raises:
        numpy.linalg.LinAlgError: if the covariance square root is singular
            and cannot be inverted.
    """
    N, _ = X.shape
    XX = input_centering(X)
    COV = np.matmul(XX.transpose(), XX) / N
    # sqrtm may hand back a complex array whose imaginary part is pure
    # round-off; drop it so the whitened data stays real.
    sqrt_COV = np.real_if_close(sqrtm(COV))
    return np.matmul(XX, np.linalg.inv(sqrt_COV))


def pca(X, top_k, center_first=True):
    """PCA dimension reduction of ``X`` onto its ``top_k`` principal directions.

    Args:
        X: N x d data matrix, one point per row.
        top_k: number of principal directions to keep (must be >= 1).
        center_first: when True the directions are computed from the centered
            data, otherwise from ``X`` as given.

    Returns:
        ``(Z, X_hat, S)`` where ``Z = X @ Vk`` is the N x top_k projection,
        ``X_hat = X @ Vk @ Vk.T`` is the reconstruction in the original space,
        and ``S`` holds the singular values in descending order.

    Raises:
        ValueError: if ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"The reduced dimension {top_k} has to be larger than 0")

    _, _ = X.shape  # insist on a 2-D matrix, as the original did
    XX = input_centering(X) if center_first else X

    # np.linalg.svd returns Vh = V^T: the right singular vectors are its ROWS.
    # The previous `V[:, :top_k]` sliced columns of Vh, which is only right
    # when Vh happens to be symmetric. full_matrices=False skips building the
    # N x N left factor, which is never used.
    _, S, Vh = np.linalg.svd(XX, full_matrices=False)
    Vk = Vh[:top_k, :].transpose()

    # NOTE(review): as before, the projection and reconstruction use the raw X
    # rather than the centered data, so with center_first=True they are offset
    # by the projected mean -- confirm this is what callers expect.
    Z = np.matmul(X, Vk)
    X_hat = np.matmul(Z, Vk.transpose())
    return Z, X_hat, S
self.transformer is not None: + X = self.transformer(X) + M, _ = X.shape predicted = [] for idx in np.arange(M):