diff --git a/Diabetes Classification/.ipynb_checkpoints/diabetesclassification-checkpoint.ipynb b/Diabetes Classification/.ipynb_checkpoints/diabetesclassification-checkpoint.ipynb index d4d68ed..0f854e2 100644 --- a/Diabetes Classification/.ipynb_checkpoints/diabetesclassification-checkpoint.ipynb +++ b/Diabetes Classification/.ipynb_checkpoints/diabetesclassification-checkpoint.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 166, + "execution_count": 36, "id": "312c95a1", "metadata": {}, "outputs": [], @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 167, + "execution_count": 37, "id": "aea1b45b", "metadata": {}, "outputs": [], @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 168, + "execution_count": 38, "id": "2ac5c17b", "metadata": {}, "outputs": [ @@ -275,7 +275,7 @@ "[5132 rows x 11 columns]" ] }, - "execution_count": 168, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -286,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 169, + "execution_count": 39, "id": "c4484360", "metadata": {}, "outputs": [ @@ -408,7 +408,7 @@ "4 4 50 F 24 3.6 1.3 0.9 2.1 50.0 2.0 0" ] }, - "execution_count": 169, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -427,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 170, + "execution_count": 40, "id": "ea7c6dcb", "metadata": {}, "outputs": [ @@ -437,7 +437,7 @@ "(5132, 11)" ] }, - "execution_count": 170, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -456,7 +456,7 @@ }, { "cell_type": "code", - "execution_count": 171, + "execution_count": 41, "id": "a3c2ff78", "metadata": {}, "outputs": [ @@ -499,7 +499,7 @@ }, { "cell_type": "code", - "execution_count": 172, + "execution_count": 42, "id": "a9cc7614", "metadata": {}, "outputs": [], @@ -510,7 +510,7 @@ }, { "cell_type": "code", - "execution_count": 173, + "execution_count": 43, "id": "cdee3c52", "metadata": { "scrolled": 
true @@ -554,7 +554,7 @@ }, { "cell_type": "code", - "execution_count": 174, + "execution_count": 44, "id": "2f353609", "metadata": {}, "outputs": [ @@ -572,7 +572,7 @@ "Name: Age, dtype: float64" ] }, - "execution_count": 174, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -583,7 +583,7 @@ }, { "cell_type": "code", - "execution_count": 175, + "execution_count": 45, "id": "b7887949", "metadata": {}, "outputs": [ @@ -601,7 +601,7 @@ "Name: BMI, dtype: float64" ] }, - "execution_count": 175, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -612,7 +612,7 @@ }, { "cell_type": "code", - "execution_count": 176, + "execution_count": 46, "id": "fdf468b3", "metadata": {}, "outputs": [ @@ -630,7 +630,7 @@ "Name: Chol, dtype: float64" ] }, - "execution_count": 176, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -641,7 +641,7 @@ }, { "cell_type": "code", - "execution_count": 177, + "execution_count": 47, "id": "7877549d", "metadata": {}, "outputs": [ @@ -659,7 +659,7 @@ "Name: TG, dtype: float64" ] }, - "execution_count": 177, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -670,7 +670,7 @@ }, { "cell_type": "code", - "execution_count": 178, + "execution_count": 48, "id": "71ec5fcb", "metadata": {}, "outputs": [ @@ -688,7 +688,7 @@ "Name: HDL, dtype: float64" ] }, - "execution_count": 178, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -699,7 +699,7 @@ }, { "cell_type": "code", - "execution_count": 179, + "execution_count": 49, "id": "bbd3682c", "metadata": {}, "outputs": [ @@ -717,7 +717,7 @@ "Name: LDL, dtype: float64" ] }, - "execution_count": 179, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -728,7 +728,7 @@ }, { "cell_type": "code", - "execution_count": 180, + "execution_count": 50, "id": "c4603e52", "metadata": {}, "outputs": [ @@ -746,7 +746,7 @@ "Name: Cr, dtype: float64" ] }, - 
"execution_count": 180, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -757,7 +757,7 @@ }, { "cell_type": "code", - "execution_count": 181, + "execution_count": 51, "id": "f98b84c7", "metadata": {}, "outputs": [ @@ -775,7 +775,7 @@ "Name: BUN, dtype: float64" ] }, - "execution_count": 181, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -794,7 +794,7 @@ }, { "cell_type": "code", - "execution_count": 182, + "execution_count": 52, "id": "a9add765", "metadata": {}, "outputs": [ @@ -822,7 +822,7 @@ }, { "cell_type": "code", - "execution_count": 183, + "execution_count": 53, "id": "752e448c", "metadata": {}, "outputs": [], @@ -835,7 +835,7 @@ }, { "cell_type": "code", - "execution_count": 184, + "execution_count": 54, "id": "a8441035", "metadata": {}, "outputs": [], @@ -846,7 +846,7 @@ }, { "cell_type": "code", - "execution_count": 185, + "execution_count": 55, "id": "378ecf72", "metadata": {}, "outputs": [ @@ -892,7 +892,7 @@ }, { "cell_type": "code", - "execution_count": 186, + "execution_count": 56, "id": "39477c2a", "metadata": {}, "outputs": [ @@ -953,7 +953,7 @@ }, { "cell_type": "code", - "execution_count": 187, + "execution_count": 57, "id": "651dd699", "metadata": {}, "outputs": [], @@ -961,24 +961,25 @@ "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFE,SelectKBest\n", "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n", + "from sklearn.metrics import accuracy_score, confusion_matrix\n", + "\n", "\n", "\n" ] }, { "cell_type": "code", - "execution_count": 188, + "execution_count": 58, "id": "e1801370", "metadata": {}, "outputs": [], "source": [ - "data_new=data_new.drop(['Gender'],axis=1)\n" + "data_new=data_new.drop('Gender',axis=1)\n" ] }, { "cell_type": "code", - "execution_count": 189, + "execution_count": 59, "id": 
"eb6127cc", "metadata": {}, "outputs": [ @@ -1088,7 +1089,7 @@ "4 50 24.0 3.6 1.3 0.9 2.1 50.0 2.0 0" ] }, - "execution_count": 189, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } @@ -1099,7 +1100,7 @@ }, { "cell_type": "code", - "execution_count": 190, + "execution_count": 60, "id": "12304d13", "metadata": {}, "outputs": [], @@ -1110,28 +1111,25 @@ }, { "cell_type": "code", - "execution_count": 191, + "execution_count": 61, "id": "dfd08af6", "metadata": {}, "outputs": [], "source": [ "X_f = SelectKBest(f_classif, k=5).fit_transform(X, y)\n", "\n", - "# Assuming X has feature names and k=5\n", + "\n", "selector = SelectKBest(f_classif, k=5).fit(X, y)\n", "\n", - "# Check scikit-learn version (optional)\n", - "# import sklearn as skl\n", - "# print(f\"scikit-learn version: {skl.__version__}\")\n", "\n", - "if hasattr(selector, 'support_'): # For older versions\n", + "if hasattr(selector, 'support_'): \n", " support = selector.support_\n", "else:\n", - " support = selector.get_support() # For newer versions\n", + " support = selector.get_support() \n", + "\n", + "selected_features = X.columns[support] \n", "\n", - "selected_features = X.columns[support] # List of selected feature names\n", "\n", - "# Columns to filter out (original features not in selected_features)\n", "columns_to_filter = [col for col in X.columns if col not in selected_features]\n", "\n" ] @@ -1146,7 +1144,7 @@ }, { "cell_type": "code", - "execution_count": 192, + "execution_count": 62, "id": "237cd728", "metadata": {}, "outputs": [ @@ -1156,7 +1154,7 @@ "['Chol', 'Cr', 'BUN']" ] }, - "execution_count": 192, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -1175,17 +1173,17 @@ }, { "cell_type": "code", - "execution_count": 193, + "execution_count": 63, "id": "40e0c236", "metadata": {}, "outputs": [], "source": [ - "data_new=data_new.drop(['Chol','Cr','BUN'],axis=1)" + "data_new_kbest=data_new.drop(['Chol','Cr','BUN'],axis=1)" ] }, { 
"cell_type": "code", - "execution_count": 194, + "execution_count": 64, "id": "6f5836ec", "metadata": {}, "outputs": [ @@ -1340,13 +1338,13 @@ "[5132 rows x 6 columns]" ] }, - "execution_count": 194, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data_new" + "data_new_kbest" ] }, { @@ -1359,9 +1357,351 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "id": "67e27ab3", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.8198636806231743\n", + "[[536 68]\n", + " [117 306]]\n" + ] + } + ], + "source": [ + "y_f= data_new_kbest['Diagnosis']\n", + "X_f=data_new_kbest.drop(['Diagnosis'],axis=1)\n", + "X_train, X_test, y_train, y_test = train_test_split(X_f, y_f, test_size=0.2, random_state=42)\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "rfc = RandomForestClassifier(n_estimators=100) \n", + "rfc.fit(X_train, y_train)\n", + "y_pred = rfc.predict(X_test)\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "print(\"Accuracy:\", accuracy)\n", + "cm = confusion_matrix(y_test, y_pred)\n", + "print(cm)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "8e6b9595", + "metadata": {}, + "source": [ + "Applying a random forest classifier on the dataset obtained by using f-statistic in selectkbest, we also bobserve the model accuracy and the confusion matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "f63ffb4d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of selected features: 8\n", + "Selected Features: Index(['Age', 'BMI', 'Chol', 'TG', 'HDL', 'LDL', 'Cr', 'BUN'], dtype='object')\n" + ] + } + ], + "source": [ + "from sklearn.feature_selection import RFECV\n", + "X_rfecv = data_new.drop('Diagnosis', axis=1)\n", + "y_rfecv = data_new['Diagnosis']\n", + "X_train, X_test, y_train, y_test = train_test_split(X_rfecv, y_rfecv, 
test_size=0.2, random_state=42)\n", + "rfecv = RFECV(estimator=RandomForestClassifier(n_estimators=1000), cv=5) \n", + "rfecv.fit(X_train, y_train)\n", + "print(\"Number of selected features:\", rfecv.n_features_)\n", + "selected_features = X.columns[rfecv.support_]\n", + "print(\"Selected Features:\", selected_features)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "e9ef9701", + "metadata": {}, + "source": [ + "Applying Recursive Feature Elimination with Cross Validation to filter out features" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "22dbb42e", + "metadata": {}, + "outputs": [], + "source": [ + "data_new_rfecv=data_new\n" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "d96d1966", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeBMICholTGHDLLDLCrBUNDiagnosis
05024.04.200.902.401.4046.04.700
12623.03.701.401.102.1062.04.500
23321.04.901.000.802.0046.07.100
34521.02.901.001.001.5024.02.300
45024.03.601.300.902.1050.02.000
..............................
51275423.05.001.501.242.9877.03.501
51285022.04.372.091.372.2947.34.401
51296724.03.891.381.142.1770.64.731
51306029.05.911.291.732.8550.27.331
51313734.05.422.661.082.8775.54.611
\n", + "

5132 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " Age BMI Chol TG HDL LDL Cr BUN Diagnosis\n", + "0 50 24.0 4.20 0.90 2.40 1.40 46.0 4.70 0\n", + "1 26 23.0 3.70 1.40 1.10 2.10 62.0 4.50 0\n", + "2 33 21.0 4.90 1.00 0.80 2.00 46.0 7.10 0\n", + "3 45 21.0 2.90 1.00 1.00 1.50 24.0 2.30 0\n", + "4 50 24.0 3.60 1.30 0.90 2.10 50.0 2.00 0\n", + "... ... ... ... ... ... ... ... ... ...\n", + "5127 54 23.0 5.00 1.50 1.24 2.98 77.0 3.50 1\n", + "5128 50 22.0 4.37 2.09 1.37 2.29 47.3 4.40 1\n", + "5129 67 24.0 3.89 1.38 1.14 2.17 70.6 4.73 1\n", + "5130 60 29.0 5.91 1.29 1.73 2.85 50.2 7.33 1\n", + "5131 37 34.0 5.42 2.66 1.08 2.87 75.5 4.61 1\n", + "\n", + "[5132 rows x 9 columns]" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_new_rfecv" + ] + }, + { + "cell_type": "markdown", + "id": "df381a5f", + "metadata": {}, + "source": [ + "New dataset generated after using RFECV" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "084425a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.818889970788705\n", + "[[539 65]\n", + " [121 302]]\n" + ] + } + ], + "source": [ + "y_rfecv= data_new_rfecv['Diagnosis']\n", + "X_rfecv=data_new_rfecv.drop(['Diagnosis'],axis=1)\n", + "X_train, X_test, y_train, y_test = train_test_split(X_rfecv, y_rfecv, test_size=0.2, random_state=42)\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "rfc = RandomForestClassifier(n_estimators=100) \n", + "rfc.fit(X_train, y_train)\n", + "y_pred = rfc.predict(X_test)\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "print(\"Accuracy:\", accuracy)\n", + "cm = confusion_matrix(y_test, y_pred)\n", + "print(cm)" + ] + }, + { + "cell_type": "markdown", + "id": "a52ed238", + "metadata": {}, + "source": [ + "Applying a random forest classifier on the dataset obtained by using RFECV, we also bobserve the model accuracy and the confusion matrix" + ] + }, + { + "cell_type": 
"markdown", + "id": "59349ad6", + "metadata": {}, + "source": [ + "Since, we observe that the accuracy score of the model trained on datset generated by selectkbest is slightly higher that that trained by dataset generate by rfecv, hence we will use the 5 features selected by selectkbest as they have the most significant relationship with diagnosis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82916004", + "metadata": {}, "outputs": [], "source": [] } diff --git a/Diabetes Classification/diabetesclassification.ipynb b/Diabetes Classification/diabetesclassification.ipynb index d4d68ed..0f854e2 100644 --- a/Diabetes Classification/diabetesclassification.ipynb +++ b/Diabetes Classification/diabetesclassification.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 166, + "execution_count": 36, "id": "312c95a1", "metadata": {}, "outputs": [], @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 167, + "execution_count": 37, "id": "aea1b45b", "metadata": {}, "outputs": [], @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 168, + "execution_count": 38, "id": "2ac5c17b", "metadata": {}, "outputs": [ @@ -275,7 +275,7 @@ "[5132 rows x 11 columns]" ] }, - "execution_count": 168, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -286,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 169, + "execution_count": 39, "id": "c4484360", "metadata": {}, "outputs": [ @@ -408,7 +408,7 @@ "4 4 50 F 24 3.6 1.3 0.9 2.1 50.0 2.0 0" ] }, - "execution_count": 169, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -427,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 170, + "execution_count": 40, "id": "ea7c6dcb", "metadata": {}, "outputs": [ @@ -437,7 +437,7 @@ "(5132, 11)" ] }, - "execution_count": 170, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -456,7 +456,7 @@ }, { "cell_type": "code", - "execution_count": 
171, + "execution_count": 41, "id": "a3c2ff78", "metadata": {}, "outputs": [ @@ -499,7 +499,7 @@ }, { "cell_type": "code", - "execution_count": 172, + "execution_count": 42, "id": "a9cc7614", "metadata": {}, "outputs": [], @@ -510,7 +510,7 @@ }, { "cell_type": "code", - "execution_count": 173, + "execution_count": 43, "id": "cdee3c52", "metadata": { "scrolled": true @@ -554,7 +554,7 @@ }, { "cell_type": "code", - "execution_count": 174, + "execution_count": 44, "id": "2f353609", "metadata": {}, "outputs": [ @@ -572,7 +572,7 @@ "Name: Age, dtype: float64" ] }, - "execution_count": 174, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -583,7 +583,7 @@ }, { "cell_type": "code", - "execution_count": 175, + "execution_count": 45, "id": "b7887949", "metadata": {}, "outputs": [ @@ -601,7 +601,7 @@ "Name: BMI, dtype: float64" ] }, - "execution_count": 175, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -612,7 +612,7 @@ }, { "cell_type": "code", - "execution_count": 176, + "execution_count": 46, "id": "fdf468b3", "metadata": {}, "outputs": [ @@ -630,7 +630,7 @@ "Name: Chol, dtype: float64" ] }, - "execution_count": 176, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -641,7 +641,7 @@ }, { "cell_type": "code", - "execution_count": 177, + "execution_count": 47, "id": "7877549d", "metadata": {}, "outputs": [ @@ -659,7 +659,7 @@ "Name: TG, dtype: float64" ] }, - "execution_count": 177, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -670,7 +670,7 @@ }, { "cell_type": "code", - "execution_count": 178, + "execution_count": 48, "id": "71ec5fcb", "metadata": {}, "outputs": [ @@ -688,7 +688,7 @@ "Name: HDL, dtype: float64" ] }, - "execution_count": 178, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -699,7 +699,7 @@ }, { "cell_type": "code", - "execution_count": 179, + "execution_count": 49, "id": "bbd3682c", "metadata": {}, 
"outputs": [ @@ -717,7 +717,7 @@ "Name: LDL, dtype: float64" ] }, - "execution_count": 179, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -728,7 +728,7 @@ }, { "cell_type": "code", - "execution_count": 180, + "execution_count": 50, "id": "c4603e52", "metadata": {}, "outputs": [ @@ -746,7 +746,7 @@ "Name: Cr, dtype: float64" ] }, - "execution_count": 180, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -757,7 +757,7 @@ }, { "cell_type": "code", - "execution_count": 181, + "execution_count": 51, "id": "f98b84c7", "metadata": {}, "outputs": [ @@ -775,7 +775,7 @@ "Name: BUN, dtype: float64" ] }, - "execution_count": 181, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -794,7 +794,7 @@ }, { "cell_type": "code", - "execution_count": 182, + "execution_count": 52, "id": "a9add765", "metadata": {}, "outputs": [ @@ -822,7 +822,7 @@ }, { "cell_type": "code", - "execution_count": 183, + "execution_count": 53, "id": "752e448c", "metadata": {}, "outputs": [], @@ -835,7 +835,7 @@ }, { "cell_type": "code", - "execution_count": 184, + "execution_count": 54, "id": "a8441035", "metadata": {}, "outputs": [], @@ -846,7 +846,7 @@ }, { "cell_type": "code", - "execution_count": 185, + "execution_count": 55, "id": "378ecf72", "metadata": {}, "outputs": [ @@ -892,7 +892,7 @@ }, { "cell_type": "code", - "execution_count": 186, + "execution_count": 56, "id": "39477c2a", "metadata": {}, "outputs": [ @@ -953,7 +953,7 @@ }, { "cell_type": "code", - "execution_count": 187, + "execution_count": 57, "id": "651dd699", "metadata": {}, "outputs": [], @@ -961,24 +961,25 @@ "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFE,SelectKBest\n", "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n", + "from sklearn.metrics import 
accuracy_score, confusion_matrix\n", + "\n", "\n", "\n" ] }, { "cell_type": "code", - "execution_count": 188, + "execution_count": 58, "id": "e1801370", "metadata": {}, "outputs": [], "source": [ - "data_new=data_new.drop(['Gender'],axis=1)\n" + "data_new=data_new.drop('Gender',axis=1)\n" ] }, { "cell_type": "code", - "execution_count": 189, + "execution_count": 59, "id": "eb6127cc", "metadata": {}, "outputs": [ @@ -1088,7 +1089,7 @@ "4 50 24.0 3.6 1.3 0.9 2.1 50.0 2.0 0" ] }, - "execution_count": 189, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } @@ -1099,7 +1100,7 @@ }, { "cell_type": "code", - "execution_count": 190, + "execution_count": 60, "id": "12304d13", "metadata": {}, "outputs": [], @@ -1110,28 +1111,25 @@ }, { "cell_type": "code", - "execution_count": 191, + "execution_count": 61, "id": "dfd08af6", "metadata": {}, "outputs": [], "source": [ "X_f = SelectKBest(f_classif, k=5).fit_transform(X, y)\n", "\n", - "# Assuming X has feature names and k=5\n", + "\n", "selector = SelectKBest(f_classif, k=5).fit(X, y)\n", "\n", - "# Check scikit-learn version (optional)\n", - "# import sklearn as skl\n", - "# print(f\"scikit-learn version: {skl.__version__}\")\n", "\n", - "if hasattr(selector, 'support_'): # For older versions\n", + "if hasattr(selector, 'support_'): \n", " support = selector.support_\n", "else:\n", - " support = selector.get_support() # For newer versions\n", + " support = selector.get_support() \n", + "\n", + "selected_features = X.columns[support] \n", "\n", - "selected_features = X.columns[support] # List of selected feature names\n", "\n", - "# Columns to filter out (original features not in selected_features)\n", "columns_to_filter = [col for col in X.columns if col not in selected_features]\n", "\n" ] @@ -1146,7 +1144,7 @@ }, { "cell_type": "code", - "execution_count": 192, + "execution_count": 62, "id": "237cd728", "metadata": {}, "outputs": [ @@ -1156,7 +1154,7 @@ "['Chol', 'Cr', 'BUN']" ] }, - 
"execution_count": 192, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -1175,17 +1173,17 @@ }, { "cell_type": "code", - "execution_count": 193, + "execution_count": 63, "id": "40e0c236", "metadata": {}, "outputs": [], "source": [ - "data_new=data_new.drop(['Chol','Cr','BUN'],axis=1)" + "data_new_kbest=data_new.drop(['Chol','Cr','BUN'],axis=1)" ] }, { "cell_type": "code", - "execution_count": 194, + "execution_count": 64, "id": "6f5836ec", "metadata": {}, "outputs": [ @@ -1340,13 +1338,13 @@ "[5132 rows x 6 columns]" ] }, - "execution_count": 194, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data_new" + "data_new_kbest" ] }, { @@ -1359,9 +1357,351 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "id": "67e27ab3", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.8198636806231743\n", + "[[536 68]\n", + " [117 306]]\n" + ] + } + ], + "source": [ + "y_f= data_new_kbest['Diagnosis']\n", + "X_f=data_new_kbest.drop(['Diagnosis'],axis=1)\n", + "X_train, X_test, y_train, y_test = train_test_split(X_f, y_f, test_size=0.2, random_state=42)\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "rfc = RandomForestClassifier(n_estimators=100) \n", + "rfc.fit(X_train, y_train)\n", + "y_pred = rfc.predict(X_test)\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "print(\"Accuracy:\", accuracy)\n", + "cm = confusion_matrix(y_test, y_pred)\n", + "print(cm)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "8e6b9595", + "metadata": {}, + "source": [ + "Applying a random forest classifier on the dataset obtained by using f-statistic in selectkbest, we also bobserve the model accuracy and the confusion matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "f63ffb4d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number 
of selected features: 8\n", + "Selected Features: Index(['Age', 'BMI', 'Chol', 'TG', 'HDL', 'LDL', 'Cr', 'BUN'], dtype='object')\n" + ] + } + ], + "source": [ + "from sklearn.feature_selection import RFECV\n", + "X_rfecv = data_new.drop('Diagnosis', axis=1)\n", + "y_rfecv = data_new['Diagnosis']\n", + "X_train, X_test, y_train, y_test = train_test_split(X_rfecv, y_rfecv, test_size=0.2, random_state=42)\n", + "rfecv = RFECV(estimator=RandomForestClassifier(n_estimators=1000), cv=5) \n", + "rfecv.fit(X_train, y_train)\n", + "print(\"Number of selected features:\", rfecv.n_features_)\n", + "selected_features = X.columns[rfecv.support_]\n", + "print(\"Selected Features:\", selected_features)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "e9ef9701", + "metadata": {}, + "source": [ + "Applying Recursive Feature Elimination with Cross Validation to filter out features" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "22dbb42e", + "metadata": {}, + "outputs": [], + "source": [ + "data_new_rfecv=data_new\n" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "d96d1966", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeBMICholTGHDLLDLCrBUNDiagnosis
05024.04.200.902.401.4046.04.700
12623.03.701.401.102.1062.04.500
23321.04.901.000.802.0046.07.100
34521.02.901.001.001.5024.02.300
45024.03.601.300.902.1050.02.000
..............................
51275423.05.001.501.242.9877.03.501
51285022.04.372.091.372.2947.34.401
51296724.03.891.381.142.1770.64.731
51306029.05.911.291.732.8550.27.331
51313734.05.422.661.082.8775.54.611
\n", + "

5132 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " Age BMI Chol TG HDL LDL Cr BUN Diagnosis\n", + "0 50 24.0 4.20 0.90 2.40 1.40 46.0 4.70 0\n", + "1 26 23.0 3.70 1.40 1.10 2.10 62.0 4.50 0\n", + "2 33 21.0 4.90 1.00 0.80 2.00 46.0 7.10 0\n", + "3 45 21.0 2.90 1.00 1.00 1.50 24.0 2.30 0\n", + "4 50 24.0 3.60 1.30 0.90 2.10 50.0 2.00 0\n", + "... ... ... ... ... ... ... ... ... ...\n", + "5127 54 23.0 5.00 1.50 1.24 2.98 77.0 3.50 1\n", + "5128 50 22.0 4.37 2.09 1.37 2.29 47.3 4.40 1\n", + "5129 67 24.0 3.89 1.38 1.14 2.17 70.6 4.73 1\n", + "5130 60 29.0 5.91 1.29 1.73 2.85 50.2 7.33 1\n", + "5131 37 34.0 5.42 2.66 1.08 2.87 75.5 4.61 1\n", + "\n", + "[5132 rows x 9 columns]" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_new_rfecv" + ] + }, + { + "cell_type": "markdown", + "id": "df381a5f", + "metadata": {}, + "source": [ + "New dataset generated after using RFECV" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "084425a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.818889970788705\n", + "[[539 65]\n", + " [121 302]]\n" + ] + } + ], + "source": [ + "y_rfecv= data_new_rfecv['Diagnosis']\n", + "X_rfecv=data_new_rfecv.drop(['Diagnosis'],axis=1)\n", + "X_train, X_test, y_train, y_test = train_test_split(X_rfecv, y_rfecv, test_size=0.2, random_state=42)\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "rfc = RandomForestClassifier(n_estimators=100) \n", + "rfc.fit(X_train, y_train)\n", + "y_pred = rfc.predict(X_test)\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "print(\"Accuracy:\", accuracy)\n", + "cm = confusion_matrix(y_test, y_pred)\n", + "print(cm)" + ] + }, + { + "cell_type": "markdown", + "id": "a52ed238", + "metadata": {}, + "source": [ + "Applying a random forest classifier to the dataset obtained by using RFECV; we also observe the model accuracy and the confusion matrix" + ] + }, + { + "cell_type": 
"markdown", + "id": "59349ad6", + "metadata": {}, + "source": [ + "Since the model trained on the dataset generated by SelectKBest has a slightly higher accuracy score than the model trained on the dataset generated by RFECV, we will use the 5 features selected by SelectKBest, as they have the most significant relationship with the diagnosis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82916004", + "metadata": {}, "outputs": [], "source": [] }