commit-live-students · tracedence · Jan 24, 2019 · Jan 25, 2019 · Jan 27, 2019 · Jan 27, 2019
diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc
diff --git a/q01_outlier_removal/__pycache__/__init__.cpython-36.pyc b/q01_outlier_removal/__pycache__/__init__.cpython-36.pyc
diff --git a/q01_outlier_removal/__pycache__/build.cpython-36.pyc b/q01_outlier_removal/__pycache__/build.cpython-36.pyc
diff --git a/q01_outlier_removal/build.py b/q01_outlier_removal/build.py
@@ -1,3 +1,4 @@
+# %load q01_outlier_removal/build.py
 # Default imports
 import pandas as pd
 
@@ -6,3 +7,19 @@
 
 
 # Write your Solution here:
+def outlier_removal(data):
+    """Drop rows lying at or above a per-column upper quantile.
+
+    The filters run one after another, so each quantile is computed on the
+    rows that survived the previous filter:
+    ``ApplicantIncome`` (0.97), then ``CoapplicantIncome`` (0.98), then
+    ``LoanAmount`` (0.97). Rows whose value is NaN in a filtered column fail
+    the ``<`` comparison and are dropped as well.
+
+    Returns the filtered DataFrame; `data` itself is not modified.
+    """
+    cutoffs = (
+        ('ApplicantIncome', 0.97),
+        ('CoapplicantIncome', 0.98),
+        ('LoanAmount', 0.97),
+    )
+
+    trimmed = data
+    for column, quantile in cutoffs:
+        threshold = trimmed[column].quantile(quantile)
+        trimmed = trimmed[trimmed[column] < threshold]
+
+    return trimmed
+# loan_data = loan_data[loan_data['ApplicantIncome'] < loan_data['ApplicantIncome'].quantile(0.97)]
+# loan_data = loan_data[loan_data['CoapplicantIncome'] < loan_data['CoapplicantIncome'].quantile(0.98)]
+# loan_data = loan_data[loan_data['LoanAmount'] < loan_data['LoanAmount'].quantile(0.97)]
+# loan_data = loan_data[(loan_data['ApplicantIncome'] < loan_data['ApplicantIncome'].quantile(0.95)) & (loan_data['CoapplicantIncome'] < loan_data['CoapplicantIncome'].quantile(0.95)) & (loan_data['LoanAmount'] < loan_data['LoanAmount'].quantile(0.95))]
+# loan_data.shape
+
+
+
diff --git a/q01_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc b/q01_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc
diff --git a/q01_outlier_removal/tests/__pycache__/test_q01_outlier_removal.cpython-36.pyc b/q01_outlier_removal/tests/__pycache__/test_q01_outlier_removal.cpython-36.pyc
diff --git a/q02_data_cleaning_all/__pycache__/__init__.cpython-36.pyc b/q02_data_cleaning_all/__pycache__/__init__.cpython-36.pyc
diff --git a/q02_data_cleaning_all/__pycache__/build.cpython-36.pyc b/q02_data_cleaning_all/__pycache__/build.cpython-36.pyc
diff --git a/q02_data_cleaning_all/build.py b/q02_data_cleaning_all/build.py
@@ -1,3 +1,4 @@
+# %load q02_data_cleaning_all/build.py
 # Default Imports
 import sys, os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname('__file__'))))
@@ -12,3 +13,19 @@
 
 
 # Write your solution here :
+
+def data_cleaning(data):
+    """Impute missing values in `data` and split it into train/test sets.
+
+    ``LoanAmount`` NaNs are replaced by the column mean; every column in
+    ``cat_list`` has its NaNs replaced by that column's most frequent value.
+    The filled columns are written back to `data` itself, so the caller's
+    DataFrame is updated as well.
+
+    Features are all columns except the last; the target is the last column.
+
+    Returns:
+        X, y, X_train, X_test, y_train, y_test -- the split uses
+        ``test_size=0.25`` and ``random_state=9``.
+    """
+    # Assign the filled column back rather than calling
+    # ``data[col].fillna(..., inplace=True)``: the chained in-place form
+    # operates on a temporary under pandas copy-on-write and would leave
+    # `data` unfilled.
+    data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].mean())
+
+    cat_list = ['Gender', 'Married', 'Dependents', 'Self_Employed',
+                'Loan_Amount_Term', 'Credit_History']
+    for column in cat_list:
+        modes = data[column].mode()
+        # An all-NaN column has no mode; leave it untouched instead of raising.
+        if not modes.empty:
+            data[column] = data[column].fillna(modes.iloc[0])
+
+    X = data.iloc[:, :-1]
+    y = data.iloc[:, -1]
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.25, random_state=9)
+
+    return X, y, X_train, X_test, y_train, y_test
+
+# NOTE(review): this top-level call runs whenever the module is executed and
+# its return value is discarded. `loan_data` is not defined in the visible
+# part of this file -- presumably it is loaded in the lines above; confirm.
+data_cleaning(loan_data)
+
+
diff --git a/q02_data_cleaning_all/tests/__pycache__/__init__.cpython-36.pyc b/q02_data_cleaning_all/tests/__pycache__/__init__.cpython-36.pyc
diff --git a/q02_data_cleaning_all/tests/__pycache__/test_q02_data_cleaning.cpython-36.pyc b/q02_data_cleaning_all/tests/__pycache__/test_q02_data_cleaning.cpython-36.pyc
diff --git a/q02_data_cleaning_all_2/__pycache__/__init__.cpython-36.pyc b/q02_data_cleaning_all_2/__pycache__/__init__.cpython-36.pyc
diff --git a/q02_data_cleaning_all_2/__pycache__/build.cpython-36.pyc b/q02_data_cleaning_all_2/__pycache__/build.cpython-36.pyc
diff --git a/q02_data_cleaning_all_2/build.py b/q02_data_cleaning_all_2/build.py
@@ -1,3 +1,4 @@
+# %load q02_data_cleaning_all_2/build.py
 # Default Imports
 import pandas as pd
 import numpy as np
@@ -11,3 +12,61 @@
 
 
 # Write your solution here :
+def _encode_features(frame):
+    """Return an encoded copy of `frame`; the input frame is left untouched."""
+    frame = frame.copy()
+
+    # Square-root the int/float columns. The selection is made before the
+    # categorical encoding below, so the 0/1 codes are not part of it.
+    num_col = frame.select_dtypes(['int', 'float']).columns
+    if len(num_col) > 0:
+        frame[num_col] = frame[num_col].apply(np.sqrt)
+
+    binary_codes = {
+        'Gender': {'Male': 0, 'Female': 1},
+        'Married': {'Yes': 1, 'No': 0},
+        'Education': {'Graduate': 1, 'Not Graduate': 0},
+        'Self_Employed': {'Yes': 1, 'No': 0},
+    }
+    for column, codes in binary_codes.items():
+        # Assign back instead of a chained ``replace(..., inplace=True)``,
+        # which would act on a temporary under pandas copy-on-write.
+        frame[column] = frame[column].replace(codes)
+
+    # One-hot encode with a fixed set of output columns (the baseline levels
+    # 'Rural' and '0' are left out). Reindexing guarantees the same schema
+    # for every frame even when a category is absent from it.
+    area = pd.get_dummies(frame['Property_Area']).reindex(
+        columns=['Semiurban', 'Urban'], fill_value=0)
+    area.columns = ['Property_Area_Semiurban', 'Property_Area_Urban']
+
+    dependents = pd.get_dummies(frame['Dependents']).reindex(
+        columns=['1', '2', '3+'], fill_value=0)
+    dependents.columns = ['Dependents_1', 'Dependents_2', 'Dependents_3']
+
+    encoded = pd.concat(
+        [frame.drop(['Property_Area', 'Dependents'], axis=1), area, dependents],
+        axis=1)
+
+    # Callers receive a fresh 0..n-1 index, as with the earlier merge-based
+    # version of this function.
+    return encoded.reset_index(drop=True)
+
+
+def data_cleaning_2(X_train, X_test, y_train, y_test):
+    """Encode the train and test feature frames for modelling.
+
+    Both frames go through the same steps:
+      * int/float columns are square-rooted;
+      * ``Gender``, ``Married``, ``Education`` and ``Self_Employed`` are
+        replaced by 0/1 codes;
+      * ``Property_Area`` becomes ``Property_Area_Semiurban`` /
+        ``Property_Area_Urban`` and ``Dependents`` becomes
+        ``Dependents_1`` / ``Dependents_2`` / ``Dependents_3``.
+
+    The input frames are not modified. The returned feature frames carry a
+    new 0..n-1 index; `y_train` and `y_test` are returned unchanged.
+
+    Returns:
+        X_train, X_test, y_train, y_test
+    """
+    X_train = _encode_features(X_train)
+    X_test = _encode_features(X_test)
+
+    return X_train, X_test, y_train, y_test
+
+# num_col = X_test.select_dtypes(['int','float']).columns
+# X_test[num_col] = X_test[num_col].apply(np.sqrt)
+# X_test['Gender'].replace({'Male':0, 'Female':1},inplace = True)
+# X_test['Married'].replace({'Yes':1, 'No': 0}, inplace= True)
+# X_test['Education'].replace({'Graduate':1, 'Not Graduate':0}, inplace = True)
+# X_test['Self_Employed'].replace({'Yes':1, 'No':0},inplace = True)
+# new_pro = pd.get_dummies(X_test['Property_Area']).reset_index()
+# X_test  = X_test.reset_index()
+# X_test = X_test.merge(new_pro, how='left', left_on = 'index', right_on = 'index')
+# X_test.index = X_test['index']
+# new_dependent = pd.get_dummies(X_test['Dependents']).reset_index()
+# X_test = X_test.merge(new_dependent, how = 'left', left_on = 'index' ,right_on = 'index')
+# X_test.drop(['Property_Area', 'Dependents', 'Rural','index', '0'], axis = 1, inplace = True)
+
+
+
+
+
diff --git a/q02_data_cleaning_all_2/tests/__pycache__/__init__.cpython-36.pyc b/q02_data_cleaning_all_2/tests/__pycache__/__init__.cpython-36.pyc
diff --git a/q02_data_cleaning_all_2/tests/__pycache__/q02_test_data_cleaning_2.cpython-36.pyc b/q02_data_cleaning_all_2/tests/__pycache__/q02_test_data_cleaning_2.cpython-36.pyc
diff --git a/q03_logistic_regression/__pycache__/__init__.cpython-36.pyc b/q03_logistic_regression/__pycache__/__init__.cpython-36.pyc
diff --git a/q03_logistic_regression/__pycache__/build.cpython-36.pyc b/q03_logistic_regression/__pycache__/build.cpython-36.pyc
diff --git a/q03_logistic_regression/build.py b/q03_logistic_regression/build.py
@@ -1,3 +1,4 @@
+# %load q03_logistic_regression/build.py
 # Default Imports
 import pandas as pd
 from sklearn.preprocessing import StandardScaler
@@ -15,4 +16,20 @@
 
 
 # Write your solution code here:
+def logistic_regression(X_train, X_test, y_train, y_test):
+    """Fit a logistic regression and return the test-set confusion matrix.
+
+    Float columns of `X_train` are standardised with a ``StandardScaler``
+    fitted on the training data; the same columns of `X_test` are then
+    transformed with that already-fitted scaler. The caller's frames are
+    not modified.
+
+    Returns:
+        The result of ``confusion_matrix(y_test, y_pred)`` for the
+        predictions made on `X_test`.
+    """
+    # Work on copies so the caller's frames are not overwritten with scaled
+    # values.
+    X_train = X_train.copy()
+    X_test = X_test.copy()
+
+    num_col = X_train.select_dtypes(['float']).columns
+    if len(num_col) > 0:
+        scaler = StandardScaler()
+        X_train[num_col] = scaler.fit_transform(X_train[num_col])
+        # Reuse the training mean/variance: re-fitting on the test set would
+        # put train and test features on different scales.
+        X_test[num_col] = scaler.transform(X_test[num_col])
+
+    logistic_model = LogisticRegression()
+    logistic_model.fit(X_train, y_train)
+    y_pred = logistic_model.predict(X_test)
+
+    return confusion_matrix(y_test, y_pred)
+
 
diff --git a/q03_logistic_regression/tests/__pycache__/__init__.cpython-36.pyc b/q03_logistic_regression/tests/__pycache__/__init__.cpython-36.pyc
diff --git a/q03_logistic_regression/tests/__pycache__/test_q03_logistic_regression.cpython-36.pyc b/q03_logistic_regression/tests/__pycache__/test_q03_logistic_regression.cpython-36.pyc