From e799f1fc6c4b5121a0c77dd72c2e1d8ba6313d45 Mon Sep 17 00:00:00 2001 From: Awantik Das Date: Thu, 29 Nov 2018 17:17:35 +0530 Subject: [PATCH] Add files via upload --- ...dy 1 - Employee Attrition Prediction.ipynb | 1106 +++++++++++++++++ ... 2 - Review Sentiment Classification.ipynb | 496 ++++++++ Case Study Churn Prediction.ipynb | 1064 ++++++++++++++++ Case Study Clothe Identification.ipynb | 216 ++++ 4 files changed, 2882 insertions(+) create mode 100644 Case Study 1 - Employee Attrition Prediction.ipynb create mode 100644 Case Study 2 - Review Sentiment Classification.ipynb create mode 100644 Case Study Churn Prediction.ipynb create mode 100644 Case Study Clothe Identification.ipynb diff --git a/Case Study 1 - Employee Attrition Prediction.ipynb b/Case Study 1 - Employee Attrition Prediction.ipynb new file mode 100644 index 0000000..5e529e0 --- /dev/null +++ b/Case Study 1 - Employee Attrition Prediction.ipynb @@ -0,0 +1,1106 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "emp_data = pd.read_csv('https://raw.githubusercontent.com/zekelabs/data-science-complete-tutorial/master/Data/HR_comma_sep.csv.txt')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
satisfaction_levellast_evaluationnumber_projectaverage_montly_hourstime_spend_companyWork_accidentleftpromotion_last_5yearssalessalary
00.380.5321573010saleslow
10.800.8652626010salesmedium
20.110.8872724010salesmedium
30.720.8752235010saleslow
40.370.5221593010saleslow
\n", + "
" + ], + "text/plain": [ + " satisfaction_level last_evaluation number_project average_montly_hours \\\n", + "0 0.38 0.53 2 157 \n", + "1 0.80 0.86 5 262 \n", + "2 0.11 0.88 7 272 \n", + "3 0.72 0.87 5 223 \n", + "4 0.37 0.52 2 159 \n", + "\n", + " time_spend_company Work_accident left promotion_last_5years sales \\\n", + "0 3 0 1 0 sales \n", + "1 6 0 1 0 sales \n", + "2 4 0 1 0 sales \n", + "3 5 0 1 0 sales \n", + "4 3 0 1 0 sales \n", + "\n", + " salary \n", + "0 low \n", + "1 medium \n", + "2 medium \n", + "3 low \n", + "4 low " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "emp_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "emp_data.rename(columns={'sales':'department'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "satisfaction_level -0.388375\n", + "last_evaluation 0.006567\n", + "number_project 0.023787\n", + "average_montly_hours 0.071287\n", + "time_spend_company 0.144822\n", + "Work_accident -0.154622\n", + "left 1.000000\n", + "promotion_last_5years -0.061788\n", + "Name: left, dtype: float64" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "emp_data.corr()['left']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Selecting categorical columns & integer columns" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "cat_emp_data = emp_data.select_dtypes('object')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "int_emp_data = emp_data.select_dtypes('int64')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Preprocessing Categorical Columns" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import LabelEncoder, OneHotEncoder" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "le = LabelEncoder()\n", + "ohe = OneHotEncoder()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LabelEncoder()" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "le.fit(cat_emp_data.department)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\awant\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + } + ], + "source": [ + "cat_emp_data['department_tf'] = le.transform(cat_emp_data.department)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
departmentsalarydepartment_tf
0saleslow7
1salesmedium7
2salesmedium7
3saleslow7
4saleslow7
\n", + "
" + ], + "text/plain": [ + " department salary department_tf\n", + "0 sales low 7\n", + "1 sales medium 7\n", + "2 sales medium 7\n", + "3 sales low 7\n", + "4 sales low 7" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat_emp_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['IT', 'RandD', 'accounting', 'hr', 'management', 'marketing',\n", + " 'product_mng', 'sales', 'support', 'technical'], dtype=object)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "le.classes_" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['sales'], dtype=object)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "le.inverse_transform([7])" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\awant\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:363: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n", + "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n", + "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n", + " warnings.warn(msg, FutureWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "OneHotEncoder(categorical_features=None, categories=None,\n", + " dtype=, handle_unknown='error',\n", + " n_values='auto', sparse=True)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ohe.fit(cat_emp_data[['department_tf']])" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "department_tf = ohe.transform(cat_emp_data[['department_tf']]).toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import FunctionTransformer" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def func(x):\n", + " def mapping(d):\n", + " if d == 'low':\n", + " return 1\n", + " elif d == 'medium':\n", + " return 2\n", + " else:\n", + " return 3\n", + " return x.map( mapping )\n", + " \n", + "ft = FunctionTransformer(func, validate=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\awant\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + } + ], + "source": [ + "cat_emp_data['salary_tf'] = ft.transform(cat_emp_data.salary)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Preprocessing Number Data" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
number_projectaverage_montly_hourstime_spend_companyWork_accidentleftpromotion_last_5years
021573010
152626010
272724010
352235010
421593010
\n", + "
" + ], + "text/plain": [ + " number_project average_montly_hours time_spend_company Work_accident \\\n", + "0 2 157 3 0 \n", + "1 5 262 6 0 \n", + "2 7 272 4 0 \n", + "3 5 223 5 0 \n", + "4 2 159 3 0 \n", + "\n", + " left promotion_last_5years \n", + "0 1 0 \n", + "1 1 0 \n", + "2 1 0 \n", + "3 1 0 \n", + "4 1 0 " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "int_emp_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\awant\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + } + ], + "source": [ + "int_emp_data.drop('left',axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import MinMaxScaler" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "mms = MinMaxScaler()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\awant\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\data.py:323: DataConversionWarning: Data with input dtype int64 were all converted to float64 by MinMaxScaler.\n", + " return self.partial_fit(X, y)\n" + ] + }, + { + "data": { + "text/plain": [ + "MinMaxScaler(copy=True, feature_range=(0, 1))" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mms.fit(int_emp_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "int_tf = mms.transform(int_emp_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0. , 0.28504673, 0.125 , 0. , 0. ],\n", + " [0.6 , 0.77570093, 0.5 , 0. , 0. ],\n", + " [1. , 0.82242991, 0.25 , 0. , 0. ],\n", + " ...,\n", + " [0. , 0.21962617, 0.125 , 0. , 0. ],\n", + " [0.8 , 0.85981308, 0.25 , 0. , 0. ],\n", + " [0. , 0.28971963, 0.125 , 0. , 0. ]])" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "int_tf" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "float_tf = emp_data[['satisfaction_level','last_evaluation']].values" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(14999,)" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat_emp_data['salary_tf'].values.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(14999, 5)" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "int_tf.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]])" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "department_tf[:2]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Joining the data" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "salary_tf = cat_emp_data['salary_tf'].values.reshape(-1,1)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "feature_data = np.hstack([department_tf,int_tf,float_tf,salary_tf])" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "target_data = emp_data.left" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Split data into two parts" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "trainX, testX, trainY, testY = train_test_split(feature_data,target_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Model Training" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.ensemble import RandomForestClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "lr = LogisticRegression()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\awant\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", + " FutureWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, max_iter=100, multi_class='warn',\n", + " n_jobs=None, penalty='l2', random_state=None, solver='warn',\n", + " tol=0.0001, verbose=0, warm_start=False)" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr.fit(trainX,trainY)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "rf = RandomForestClassifier()" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\awant\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:248: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", + " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", + " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,\n", + " oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False)" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf.fit(trainX,trainY)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Model Validation" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr.score(testX,testY)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9898666666666667" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf.score(testX,testY)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import recall_score,precision_score, f1_score, classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "pred = rf.predict(testX)" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9904191616766467" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "precision_score(y_pred=pred, y_true=testY)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9775413711583923" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "f1_score(y_pred=pred, y_true=testY)" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.99 1.00 0.99 2893\n", + " 1 0.99 0.96 0.98 857\n", + "\n", + " micro avg 0.99 0.99 0.99 3750\n", + " macro avg 0.99 0.98 0.99 3750\n", + "weighted avg 0.99 0.99 0.99 3750\n", + "\n" + ] + } + ], + "source": [ + "print (classification_report(y_pred=pred, y_true=testY))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Case Study 2 - Review Sentiment Classification.ipynb b/Case Study 2 - Review Sentiment Classification.ipynb new file mode 100644 index 0000000..78f1746 --- /dev/null +++ b/Case Study 2 - Review Sentiment Classification.ipynb @@ -0,0 +1,496 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "review_data = pd.read_csv('C:/Users/awant/Documents/Reviews.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 568454 entries, 0 to 568453\n", + "Data columns (total 10 columns):\n", + "Id 568454 non-null int64\n", + "ProductId 568454 non-null object\n", + "UserId 568454 non-null object\n", + "ProfileName 568438 non-null object\n", + "HelpfulnessNumerator 568454 non-null int64\n", + "HelpfulnessDenominator 568454 non-null int64\n", + "Score 568454 non-null int64\n", + "Time 568454 non-null int64\n", + "Summary 568427 non-null object\n", + "Text 568454 non-null object\n", + "dtypes: int64(5), object(5)\n", + "memory usage: 43.4+ MB\n" + ] + } + ], + "source": [ + "review_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "review_data = review_data.sample(5000)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "review_data = review_data[['Text','Score']]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TextScore
22098We've had this for almost a week, and the dog ...4
295111I just have to put my two cents in on this tea...5
125547The Hanover Pumpernickle & Onion Pretzels are ...5
350934January 3, 2011<br /><br />I purchased these p...4
502462I received these and added them to the collect...5
\n", + "
" + ], + "text/plain": [ + " Text Score\n", + "22098 We've had this for almost a week, and the dog ... 4\n", + "295111 I just have to put my two cents in on this tea... 5\n", + "125547 The Hanover Pumpernickle & Onion Pretzels are ... 5\n", + "350934 January 3, 2011

I purchased these p... 4\n", + "502462 I received these and added them to the collect... 5" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "review_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "review_data = review_data[review_data.Score != 3]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "review_data['Sentiment'] = review_data.Score.map(lambda x: 0 if x < 3 else 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TextScoreSentiment
22098We've had this for almost a week, and the dog ...41
\n", + "
" + ], + "text/plain": [ + " Text Score Sentiment\n", + "22098 We've had this for almost a week, and the dog ... 4 1" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "review_data.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "cv = CountVectorizer(stop_words='english')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.naive_bayes import MultinomialNB" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "mnb = MultinomialNB()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Pipeline Creation" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_selection import SelectKBest" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "word_selector = SelectKBest(k=2000)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import make_pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "review_pipeline = make_pipeline(cv, word_selector, mnb)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('countvectorizer',\n", + " CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8', input='content',\n", + " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", + " ngram_range=(1, 1), preprocessor=None, stop_words='english',\n", + " strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", + " tokenizer=None, vocabulary=None)),\n", + " ('selectkbest',\n", + " SelectKBest(k=2000, score_func=)),\n", + " ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "review_pipeline.steps" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Splitting data into train & test" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "trainX,testX, trainY,testY = train_test_split(review_data.Text, review_data.Sentiment)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Model Training" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8', input='content',\n", + " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", + " ngram_range=(1, 1), preprocessor=None, stop_words='english...x000001C7C8242BF8>)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "review_pipeline.fit(trainX,trainY)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Model Validation" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8871527777777778" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "review_pipeline.score(testX,testY)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0], dtype=int64)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "review_pipeline.predict(['Foul Bad Worse'])" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1], dtype=int64)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "review_pipeline.predict(['Good food here'])" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 3837\n", + "0 769\n", + "Name: Sentiment, dtype: int64" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "review_data.Sentiment.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Case Study Churn Prediction.ipynb b/Case Study Churn Prediction.ipynb new file mode 100644 index 0000000..39fa1a1 --- /dev/null +++ b/Case Study Churn Prediction.ipynb @@ -0,0 +1,1064 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "churn_data = pd.read_csv('https://raw.githubusercontent.com/zekelabs/data-science-complete-tutorial/master/Data/churn.csv.txt',\n", + " parse_dates=['last_trip_date','signup_date'] )" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 50000 entries, 0 to 49999\n", + "Data columns (total 12 columns):\n", + "avg_dist 50000 non-null float64\n", + "avg_rating_by_driver 49799 non-null float64\n", + "avg_rating_of_driver 41878 non-null float64\n", + "avg_surge 50000 non-null float64\n", + "city 50000 non-null object\n", + "last_trip_date 50000 non-null datetime64[ns]\n", + "phone 49604 non-null object\n", + "signup_date 50000 non-null datetime64[ns]\n", + "surge_pct 50000 non-null float64\n", + "trips_in_first_30_days 50000 non-null int64\n", + "luxury_car_user 50000 non-null bool\n", + "weekday_pct 50000 non-null float64\n", + "dtypes: bool(1), datetime64[ns](2), float64(6), int64(1), object(2)\n", + "memory usage: 4.2+ MB\n" + ] + } + ], + "source": [ + "churn_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
avg_distavg_rating_by_driveravg_rating_of_driveravg_surgecitylast_trip_datephonesignup_datesurge_pcttrips_in_first_30_daysluxury_car_userweekday_pct
03.675.04.71.10King's Landing2014-06-17iPhone2014-01-2515.44True46.2
18.265.05.01.00Astapor2014-05-05Android2014-01-290.00False50.0
20.775.04.31.00Astapor2014-01-07iPhone2014-01-060.03False100.0
32.364.94.61.14King's Landing2014-06-29iPhone2014-01-1020.09True80.0
43.134.94.41.19Winterfell2014-03-15Android2014-01-2711.814False82.4
\n", + "
" + ], + "text/plain": [ + " avg_dist avg_rating_by_driver avg_rating_of_driver avg_surge \\\n", + "0 3.67 5.0 4.7 1.10 \n", + "1 8.26 5.0 5.0 1.00 \n", + "2 0.77 5.0 4.3 1.00 \n", + "3 2.36 4.9 4.6 1.14 \n", + "4 3.13 4.9 4.4 1.19 \n", + "\n", + " city last_trip_date phone signup_date surge_pct \\\n", + "0 King's Landing 2014-06-17 iPhone 2014-01-25 15.4 \n", + "1 Astapor 2014-05-05 Android 2014-01-29 0.0 \n", + "2 Astapor 2014-01-07 iPhone 2014-01-06 0.0 \n", + "3 King's Landing 2014-06-29 iPhone 2014-01-10 20.0 \n", + "4 Winterfell 2014-03-15 Android 2014-01-27 11.8 \n", + "\n", + " trips_in_first_30_days luxury_car_user weekday_pct \n", + "0 4 True 46.2 \n", + "1 0 False 50.0 \n", + "2 3 False 100.0 \n", + "3 9 True 80.0 \n", + "4 14 False 82.4 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "churn_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2014-07-01 00:00:00')" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "churn_data.last_trip_date.max()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2014-01-01 00:00:00')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "churn_data.last_trip_date.min()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "cutoff = churn_data.last_trip_date.max() - datetime.timedelta(30,0,0)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2014-06-01 00:00:00')" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cutoff" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "churn_data['churn'] = churn_data.last_trip_date.map(lambda d: 1 if d > cutoff else 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
avg_distavg_rating_by_driveravg_rating_of_driveravg_surgecitylast_trip_datephonesignup_datesurge_pcttrips_in_first_30_daysluxury_car_userweekday_pctchurn
03.675.04.71.10King's Landing2014-06-17iPhone2014-01-2515.44True46.21
18.265.05.01.00Astapor2014-05-05Android2014-01-290.00False50.00
20.775.04.31.00Astapor2014-01-07iPhone2014-01-060.03False100.00
32.364.94.61.14King's Landing2014-06-29iPhone2014-01-1020.09True80.01
43.134.94.41.19Winterfell2014-03-15Android2014-01-2711.814False82.40
\n", + "
" + ], + "text/plain": [ + " avg_dist avg_rating_by_driver avg_rating_of_driver avg_surge \\\n", + "0 3.67 5.0 4.7 1.10 \n", + "1 8.26 5.0 5.0 1.00 \n", + "2 0.77 5.0 4.3 1.00 \n", + "3 2.36 4.9 4.6 1.14 \n", + "4 3.13 4.9 4.4 1.19 \n", + "\n", + " city last_trip_date phone signup_date surge_pct \\\n", + "0 King's Landing 2014-06-17 iPhone 2014-01-25 15.4 \n", + "1 Astapor 2014-05-05 Android 2014-01-29 0.0 \n", + "2 Astapor 2014-01-07 iPhone 2014-01-06 0.0 \n", + "3 King's Landing 2014-06-29 iPhone 2014-01-10 20.0 \n", + "4 Winterfell 2014-03-15 Android 2014-01-27 11.8 \n", + "\n", + " trips_in_first_30_days luxury_car_user weekday_pct churn \n", + "0 4 True 46.2 1 \n", + "1 0 False 50.0 0 \n", + "2 3 False 100.0 0 \n", + "3 9 True 80.0 1 \n", + "4 14 False 82.4 0 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "churn_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "cat_cols = list(churn_data.select_dtypes('object').columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['city', 'phone']" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat_cols" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "num_cols = list(churn_data.select_dtypes('float64').columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "num_cols.append('trips_in_first_30_days')" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['avg_dist',\n", + " 'avg_rating_by_driver',\n", + " 'avg_rating_of_driver',\n", + " 'avg_surge',\n", + " 'surge_pct',\n", + " 'weekday_pct',\n", + " 'trips_in_first_30_days']" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_cols" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Pipeline Creation for categorical columns & numerical columns" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import make_pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.impute import SimpleImputer\n", + "from sklearn.preprocessing import OneHotEncoder, MinMaxScaler" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "cat_pipeline = make_pipeline(SimpleImputer(strategy='constant', fill_value='missing'),\n", + " OneHotEncoder())" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "cat_data_tf = cat_pipeline.fit_transform(churn_data[cat_cols]).toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "num_pipeline = make_pipeline(SimpleImputer(strategy='median'),\n", + " MinMaxScaler()) " + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "num_data_tf = num_pipeline.fit_transform(churn_data[num_cols])" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\awant\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + } + ], + "source": [ + "feature_data = np.hstack([cat_data_tf, num_data_tf, churn_data.luxury_car_user.reshape(-1,1)])" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(50000, 14)" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "target_data = churn_data.churn" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_selection import SelectKBest\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "final_pipeline = make_pipeline(SelectKBest(k=8), RandomForestClassifier(n_estimators=20))" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "trainX, testX, trainY, testY = train_test_split(feature_data,target_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('selectkbest', SelectKBest(k=8, score_func=)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", + " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", + " min_impurity_decre...obs=None,\n", + " oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False))])" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_pipeline.fit(trainX,trainY)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.66072" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_pipeline.score(testX,testY)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import GridSearchCV" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "params = {\n", + " 'selectkbest__k':[5,8,12],\n", + " 'randomforestclassifier__n_estimators':[10,30,50]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "grid_model = GridSearchCV(final_pipeline,param_grid=params, cv=5, n_jobs=-1)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GridSearchCV(cv=5, error_score='raise-deprecating',\n", + " estimator=Pipeline(memory=None,\n", + " steps=[('selectkbest', SelectKBest(k=8, score_func=)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", + " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", + " min_impurity_decre...obs=None,\n", + " oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False))]),\n", + " fit_params=None, iid='warn', n_jobs=-1,\n", + " param_grid={'selectkbest__k': [5, 8, 12], 'randomforestclassifier__n_estimators': [10, 30, 50]},\n", + " pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n", + " scoring=None, verbose=0)" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_model.fit(trainX,trainY)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'randomforestclassifier__n_estimators': 50, 'selectkbest__k': 12}" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_model.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7619733333333333" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_model.best_score_" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7592" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_model.score(testX,testY)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.compose import ColumnTransformer" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['avg_dist',\n", + " 'avg_rating_by_driver',\n", + " 'avg_rating_of_driver',\n", + " 'avg_surge',\n", + " 'surge_pct',\n", + " 'weekday_pct',\n", + " 'trips_in_first_30_days']" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_cols" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Using column transformer" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('num', num_pipeline, num_cols),\n", + " ('cat', cat_pipeline, cat_cols)])" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "pipeline = Pipeline(steps=[('preprocessor',preprocessor),\n", + " ('selectkbest', SelectKBest(k=8)),\n", + " ('classifier',RandomForestClassifier(n_estimators=20))])" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "#pipeline = make_pipeline(preprocessor, SelectKBest(k=8), RandomForestClassifier(n_estimators=20))" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('simpleimputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,\n", + " strategy='median', verbose=0)), ('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1)))])" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "params = {\n", + " 'preprocessor__num__simpleimputer__strategy':['mean','median'],\n", + " 'selectkbest__k':[12,13],\n", + " 'classifier__n_estimators':[50,70]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "grid_model = GridSearchCV(pipeline, param_grid=params,cv=5, n_jobs=-1)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "feature_data = churn_data.drop('churn',axis=1)\n", + "target_data = churn_data.churn\n", + "trainX, testX, trainY, testY = train_test_split(feature_data,target_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GridSearchCV(cv=5, error_score='raise-deprecating',\n", + " estimator=Pipeline(memory=None,\n", + " steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,\n", + " transformer_weights=None,\n", + " transformers=[('num', Pipeline(memory=None,\n", + " steps=[('simpleimputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,\n", + " strategy='median',...obs=None,\n", + " oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False))]),\n", + " fit_params=None, iid='warn', n_jobs=-1,\n", + " param_grid={'preprocessor__num__simpleimputer__strategy': ['mean', 'median'], 'selectkbest__k': [12, 13], 'classifier__n_estimators': [50, 70]},\n", + " pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n", + " scoring=None, verbose=0)" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_model.fit(trainX,trainY)" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7528" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_model.best_score_" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'classifier__n_estimators': 70,\n", + " 'preprocessor__num__simpleimputer__strategy': 'mean',\n", + " 'selectkbest__k': 12}" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_model.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Case Study Clothe Identification.ipynb b/Case Study Clothe Identification.ipynb new file mode 100644 index 0000000..6b434d8 --- /dev/null +++ b/Case Study Clothe Identification.ipynb @@ -0,0 +1,216 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "fashion_mnist = pd.read_csv('Data/fashion-mnist_train.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(60000, 785)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fashion_mnist.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "img = fashion_mnist[1:2].values[:,1:].reshape(28,28)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAP8AAAD8CAYAAAC4nHJkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEPBJREFUeJzt3XuM1eWdx/HPV0BBLiLKLUKXbgURiVJDcLWNlxCqS0yEP2rqX9TdSGM0scY/JCamJhuTZrPt7kZjExpJMba2NWI1uq4WgqVew4go3uoFUEdwhosXUEAYvvvH/NhMcX7f53Buv4PP+5WQOXO+85zzcGY+c86Z7+/5PebuApCfE6qeAIBqEH4gU4QfyBThBzJF+IFMEX4gU4QfyBThBzJF+IFMDW3nnZkZhxMCLebuVsvXNfTMb2ZXmNnfzOxdM1vWyG0BaC+r99h+Mxsi6W1JCyR1S1ov6Rp3fyMYwzM/0GLteOafJ+ldd9/s7l9J+r2kqxq4PQBt1Ej4z5D04YDPu4vr/o6ZLTWzLjPrauC+ADRZI3/wG+ylxdde1rv7cknLJV72A52kkWf+bklTB3w+RdK2xqYDoF0aCf96SdPN7NtmdqKkH0l6tDnTAtBqdb/sd/dDZnajpCclDZG0wt1fb9rMALRU3a2+uu6M9/xAy7XlIB8Axy/CD2SK8AOZIvxApgg/kCnCD2SK8AOZIvxApgg/kCnCD2SK8AOZIvxApgg/kCnCD2SK8AOZIvxApgg/kCnCD2SK8AOZIvxApgg/kCnCD2SK8AOZIvxApgg/kCnCD2SK8AOZIvxApgg/kKm6t+iWJDPbKmmPpD5Jh9x9bjMmheYxq2nD1lLt3MX5aIsXLw7rzzzzTFjfsWNHaS31uKT+342O7wQNhb9wmbvvbMLtAGgjXvYDmWo0/C7pKTN7ycyWNmNCANqj0Zf933P3bWY2QdKfzewtd1838AuKXwr8YgA6TEPP/O6+rfjYK+lhSfMG+Zrl7j6XPwYCnaXu8JvZSDMbfeSypB9Ieq1ZEwPQWo287J8o6eGi5TFU0u/c/X+bMisALVd3+N19s6TzmjgX1CnqOXdyv3nMmDFh/dZbbw3rW7ZsCetRn7/Rx6WTH9da0eoDMkX4gUwRfiBThB/IFOEHMkX4gUxZO1sWZnb890fqcMIJ8e/YVn4POnlp6n333RfWJ0yYENZ37doV1m+66abS2s6d8ULURpdCp77n0e339fWFY1PfE3evafI88wOZIvxApgg/kCnCD2SK8AOZIvxApgg/kKlmnL0XCYcPH27p7Uc941S/OTW3RscvW7astDZ+/Phw7AcffBDW586NTw41atSo0lqqzz90aGPROHjwYEPj24FnfiBThB/IFOEHMkX4gUwRfiBThB/IFOEHMkWf/xsg6vOn+vBDhgwJ66m15VdeeWVYv+GGG0prjz32WDh27969YX3jxo1hfevWrWE90uo+/WWXXVZae+ONN8KxPT09TZkDz/xApgg/kCnCD2SK8AOZIvxApgg/kCnCD2Qq2ec3sxWSrpTU6+6zi+vGSfqDpGmStkq62t0/ad00j2+Nnhs/Nb6R8wWk+vgXXHBBWL/77rvD+tq1a0tr+/fvD8fu3r07rEe9ciles3///feHY++8886wnjqXwNixY8P6ddddV1pbuHBhOLZZannm/42kK466bpmkNe4+XdKa4nMAx5Fk+N19naSjfwVfJWllcXmlpEVNnheAFqv3Pf9Ed98uScXHeF8lAB2n5cf2m9lSSUtbfT8Ajk29z/w9ZjZZkoqPvWVf6O7L3X2uu8d/IQHQVvWG/1FJS4rLSyQ90pzpAGiXZPjN7AFJz0s6y8y6zexfJf1c0gIze0fSguJzAMcRa+X+61+7M7P23dlxpNHjABoxa9assP7kk0+G9TVr1oT1PXv2lNZ6e0vfLUqSZs6cGdYvuuiisP7ZZ5+V1kaPHh2OnTx5clh/7733wvo777wT1qM1+ddff304NsXd4x+oAkf4AZki/ECmCD+QKcIPZIrwA5ki/ECmvjGn7k61y1JbTaeWtka3n2rFNXp67BEjRoT1ffv2ldYmTpwYjl29enVYX7duXViPWnmS1N3dXVqbPXt2OPbiiy8O6zt27AjrX331VWkttVw4ahNK6S2+U63AadOmldZSLc633norrNeKZ34gU4QfyBThBzJF+IFMEX4gU4QfyBThBzL1jenzp3rtqV56o7ffiKFD429D1MeX4tNEP/XUU+HYTZs2hfUPP/wwrKd67Zdccklp7dxzzw3HpnrxqVOWn3zyyaW11PfztNNOC+svv/xyWE9tLx7d/uWXXx6Opc8PoCGEH8gU4QcyRfiBTBF+IFOEH8gU4Qcy1fY+f7QuPrXmPurNpvq2jdy2FM87dQxBo8cYzJ8/P6zfddddpbWPPvooHPvqq6+G9Wg9viQtWhTv0TpjxozS2rZt28Kxw4YNC+up4yOiNflTpkwJx6ZOvf3888+H9dTtR+v9U+emaBae+YFMEX4gU4QfyBThBzJF+IFMEX4gU4QfyFRyi24zWyHpSkm97j67uO4OSddJOrKY+zZ3/5/knWW6RfecOXPC+s033xzWL7zwwrD+yiuvlNY+/vjjcOz7778f1hcsWBDWzz///LC+efPm0trw4cPDsdF596V0Pzw6z0Fqvf2qVavC+kknnRTWp06dGtajuU+aNCkcm/qeNHOL7t9IumKQ6//T3ecU/5LBB9BZkuF393WS4lOqADjuNPKe/0Yze9XMVpjZqU2bEYC2qDf8v5L0HUlzJG2X9IuyLzSzpWbWZWZddd4XgBaoK/zu3uPufe5+WNKvJc0Lvna5u89197n1ThJA89UVfjObPODTxZJea850ALRLckmvmT0g6VJJp5tZt6SfSbrUzOZIcklbJf2khXME0ALJ8Lv7NYNcfW8L5qJRo0aF9Wj99oEDB8KxBw8eDOunnHJKWJ83r/Sdja699tpw7Nlnnx3We3p6wvoTTzwR1lPr2iOnn356WJ8+fXpY/+STT8L6iSeeWFpLHWOS+nkYMWJEWI+OYVi/fn04NvW4RMcQSOljFN5+++3S2syZM8OxZ555Zmkttc/CQBzhB2SK8AOZIvxApgg/kCnCD2SK8AOZauupu0eMGBGeynnjxo3h+DVr1pTWUm2jVKtv/PjxYX3IkCGltdSy2aeffjqsp9qUqeWjqa2qGxn7+uuvh/WzzjorrI8ZM6a0lmpRprb/fvbZZ8N6b29vaS11WvDU4xLdtpRugUb/t+hnTYrbiMeylTzP/ECmCD+QKcIPZIrwA5ki/ECmCD+QKcIPZKqtff7hw4eHy1u7uuIzfW3fvr20luoZp3qnqZ7y559/HtYjqaWnqaWrqeWhUW839f9O1Tdt2hTWU8cBnHpq+ekd9+/fH47dt29fWE8tw45On53q86e2dD906FBYHz16dFiPjt1Ifb937txZ97wG4pkfyBThBzJF+IFMEX4gU4QfyBThBzJF+IFMtX09/znnnFNaT/V99+zZU1pLrZ9Orc8eOXJkWB83blxpLTo9tZTuvab62TVso173faeOj0htNf3pp5+G9Wju0WMqSbNmzQrrqWMUou3BU9t7N3p8ROo4gb6+vtJa6twT0c8L6/kBJBF+IFOEH8gU4QcyRfiBTBF+IFOEH8hUss9vZlMl3SdpkqTDkpa7+3+b2ThJf5A0TdJWSVe7e9hsHzZsmCZNmlRanzZtWjiXqHcarfWX4p6vJO3atSusp9b7R1Jrx1M949RxBFGvPnXf0Xn1a6mnjgM477zzSmupYwzWrl0b1lPHbkTnUUgd/5B6zFPHpDTy8xIdAyAdWy8/Ussz/yFJt7j72ZL+SdINZjZL0jJJa9x9uqQ1xecAjhPJ8Lv7dnffUFzeI+lNSWdIukrSyuLLVkpa1KpJAmi+Y3rPb2bTJH1X0ouSJrr7dqn/F4SkCc2eHIDWqTn8ZjZK0kOSfuruNZ/QzsyWmlmXmXVFx+YDaK+awm9mw9Qf/N+6+6ri6h4zm1zUJ0sadOdCd1/u7nPdfW7qpIYA2icZfutf/nSvpDfd/ZcDSo9KWlJcXiLpkeZPD0CrWA3LRb8v6a+SNqm/1SdJt6n/ff8fJX1L0geSfujuuxO3Fd7ZokXx3wxvueWW0lqqbZQ6PXaqbRS1AlOn9U6dinn48OFhPdWui1paqf93Supxee6558L6gw8+WFp74YUXwrGpltf8+fPD+j333FNa27JlSzg29fP05ZdfhvW9e/eG9ehnYsqUKeHYxYsXl9a++OIL9fX1xeuVC8k+v7s/I6nsxuJHH0DH4gg/IFOEH8gU4QcyRfiBTBF+IFOEH8hUss/f1DtL9PkbkVoWO2fOnLA+b968sL5w4cLS2owZM8KxqVNUp5aHppYbHzhwoLS2evXqcOzjjz8e1lN9/CqNHTs2rEfHGERbZEvpPn7q1N+p8dGS4Q0bNoRjb7/99rDu7jX1+XnmBzJF+IFMEX4gU4QfyBThBzJF+IFMEX4gUx3V50/16lPru9F+qXMNNCK1VTUGR58fQIjwA5ki/ECmCD+QKcIPZIrwA5ki/ECmOqrPD6Bx9PkBhAg/kCnCD2SK8AOZIvxApgg/kCnCD2QqGX4zm2pma83sTTN73cxuKq6/w8w+MrONxb/yE9sD6DjJg3zMbLKkye6+wcxGS3pJ0iJJV0va6+7/UfOdcZAP0HK1HuQztIYb2i5pe3F5j5m9KemMxqYHoGrH9J7fzKZJ+q6kF4urbjSzV81shZmdWjJmqZl1mVlXQzMF0FQ1H9tvZqMk/UXSne6+yswmStopySX9m/rfGvxL4jZ42Q+0WK0v+2sKv5kNk/SYpCfd/ZeD1KdJeszdZyduh/ADLda0hT3Wvx3pvZLeHBj84g+BRyyW9NqxThJAdWr5a//3Jf1V0iZJh4urb5N0jaQ56n/Zv1XST4o/Dka3xTM/0GJNfdnfLIQfaD3W8wMIEX4gU4QfyBThBzJF+IFMEX4gU4QfyBThBzJF+IFMEX4gU4QfyBThBzJF+IFMEX4gU8kTeDbZTknvD/j89OK6TtSpc+vUeUnMrV7NnNs/1PqFbV3P/7U7N+ty97mVTSDQqXPr1HlJzK1eVc2Nl/1Apgg/kKmqw7+84vuPdOrcOnVeEnOrVyVzq/Q9P4DqVP3MD6AilYTfzK4ws7+Z2btmtqyKOZQxs61mtqnYebjSLcaKbdB6zey1AdeNM7M/m9k7xcdBt0mraG4dsXNzsLN0pY9dp+143faX/WY2RNLbkhZI6pa0XtI17v5GWydSwsy2Sprr7pX3hM3sYkl7Jd13ZDckM/t3Sbvd/efFL85T3f3WDpnbHTrGnZtbNLeynaV/rAofu2bueN0MVTzzz5P0rrtvdvevJP1e0lUVzKPjufs6SbuPuvoqSSuLyyvV/8PTdiVz6wjuvt3dNxSX90g6srN0pY9dMK9KVBH+MyR9OODzbnXWlt8u6Skze8nMllY9mUFMPLIzUvFxQsXzOVpy5+Z2Ompn6Y557OrZ8brZqgj/YLuJdFLL4Xvufr6kf5Z0Q/HyFrX5laTvqH8bt+2SflHlZIqdpR+S9FN3/7zKuQw0yLwqedyqCH+3pKkDPp8iaVsF8xiUu28rPvZKelj9b1M6Sc+RTVKLj70Vz+f/uXuPu/e5+2FJv1aFj12xs/RDkn7r7quKqyt/7AabV1WPWxXhXy9pupl928xOlPQjSY9WMI+vMbORxR9iZGYjJf1Anbf78KOSlhSXl0h6pMK5/J1O2bm5bGdpVfzYddqO15Uc5FO0Mv5L0hBJK9z9zrZPYhBm9o/qf7aX+lc8/q7KuZnZA5IuVf+qrx5JP5P0J0l/lPQtSR9I+qG7t/0PbyVzu1THuHNzi+ZWtrP0i6rwsWvmjtdNmQ9H+AF54gg/IFOEH8gU4QcyRfiBTBF+IFOEH8gU4QcyRfiBTP0fRmiaLxPxBrMAAAAASUVORK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.imshow(img, cmap='gray')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "feature_data = fashion_mnist.values[:,1:]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "target_data = fashion_mnist.values[:,0]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import SVC" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "svc = SVC(kernel='rbf', C=.1, gamma=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "trainX, testX, trainY, testY = train_test_split(feature_data, target_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from sklearn.decomposition import PCA" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline = make_pipeline(MinMaxScaler(), PCA(n_components=64), svc)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import GridSearchCV" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "params = {\n", + " 'svc__C':[1,.1,.001,],\n", + " 'svc__gamma':[10]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grid_model = GridSearchCV(pipeline, param_grid=params)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}