diff --git a/Diabetes Classification/.ipynb_checkpoints/diabetesclassification-checkpoint.ipynb b/Diabetes Classification/.ipynb_checkpoints/diabetesclassification-checkpoint.ipynb
index d4d68ed..0f854e2 100644
--- a/Diabetes Classification/.ipynb_checkpoints/diabetesclassification-checkpoint.ipynb
+++ b/Diabetes Classification/.ipynb_checkpoints/diabetesclassification-checkpoint.ipynb
@@ -10,7 +10,7 @@
},
{
"cell_type": "code",
- "execution_count": 166,
+ "execution_count": 36,
"id": "312c95a1",
"metadata": {},
"outputs": [],
@@ -29,7 +29,7 @@
},
{
"cell_type": "code",
- "execution_count": 167,
+ "execution_count": 37,
"id": "aea1b45b",
"metadata": {},
"outputs": [],
@@ -47,7 +47,7 @@
},
{
"cell_type": "code",
- "execution_count": 168,
+ "execution_count": 38,
"id": "2ac5c17b",
"metadata": {},
"outputs": [
@@ -275,7 +275,7 @@
"[5132 rows x 11 columns]"
]
},
- "execution_count": 168,
+ "execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
@@ -286,7 +286,7 @@
},
{
"cell_type": "code",
- "execution_count": 169,
+ "execution_count": 39,
"id": "c4484360",
"metadata": {},
"outputs": [
@@ -408,7 +408,7 @@
"4 4 50 F 24 3.6 1.3 0.9 2.1 50.0 2.0 0"
]
},
- "execution_count": 169,
+ "execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
@@ -427,7 +427,7 @@
},
{
"cell_type": "code",
- "execution_count": 170,
+ "execution_count": 40,
"id": "ea7c6dcb",
"metadata": {},
"outputs": [
@@ -437,7 +437,7 @@
"(5132, 11)"
]
},
- "execution_count": 170,
+ "execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
@@ -456,7 +456,7 @@
},
{
"cell_type": "code",
- "execution_count": 171,
+ "execution_count": 41,
"id": "a3c2ff78",
"metadata": {},
"outputs": [
@@ -499,7 +499,7 @@
},
{
"cell_type": "code",
- "execution_count": 172,
+ "execution_count": 42,
"id": "a9cc7614",
"metadata": {},
"outputs": [],
@@ -510,7 +510,7 @@
},
{
"cell_type": "code",
- "execution_count": 173,
+ "execution_count": 43,
"id": "cdee3c52",
"metadata": {
"scrolled": true
@@ -554,7 +554,7 @@
},
{
"cell_type": "code",
- "execution_count": 174,
+ "execution_count": 44,
"id": "2f353609",
"metadata": {},
"outputs": [
@@ -572,7 +572,7 @@
"Name: Age, dtype: float64"
]
},
- "execution_count": 174,
+ "execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
@@ -583,7 +583,7 @@
},
{
"cell_type": "code",
- "execution_count": 175,
+ "execution_count": 45,
"id": "b7887949",
"metadata": {},
"outputs": [
@@ -601,7 +601,7 @@
"Name: BMI, dtype: float64"
]
},
- "execution_count": 175,
+ "execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
@@ -612,7 +612,7 @@
},
{
"cell_type": "code",
- "execution_count": 176,
+ "execution_count": 46,
"id": "fdf468b3",
"metadata": {},
"outputs": [
@@ -630,7 +630,7 @@
"Name: Chol, dtype: float64"
]
},
- "execution_count": 176,
+ "execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
@@ -641,7 +641,7 @@
},
{
"cell_type": "code",
- "execution_count": 177,
+ "execution_count": 47,
"id": "7877549d",
"metadata": {},
"outputs": [
@@ -659,7 +659,7 @@
"Name: TG, dtype: float64"
]
},
- "execution_count": 177,
+ "execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
@@ -670,7 +670,7 @@
},
{
"cell_type": "code",
- "execution_count": 178,
+ "execution_count": 48,
"id": "71ec5fcb",
"metadata": {},
"outputs": [
@@ -688,7 +688,7 @@
"Name: HDL, dtype: float64"
]
},
- "execution_count": 178,
+ "execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
@@ -699,7 +699,7 @@
},
{
"cell_type": "code",
- "execution_count": 179,
+ "execution_count": 49,
"id": "bbd3682c",
"metadata": {},
"outputs": [
@@ -717,7 +717,7 @@
"Name: LDL, dtype: float64"
]
},
- "execution_count": 179,
+ "execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
@@ -728,7 +728,7 @@
},
{
"cell_type": "code",
- "execution_count": 180,
+ "execution_count": 50,
"id": "c4603e52",
"metadata": {},
"outputs": [
@@ -746,7 +746,7 @@
"Name: Cr, dtype: float64"
]
},
- "execution_count": 180,
+ "execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
@@ -757,7 +757,7 @@
},
{
"cell_type": "code",
- "execution_count": 181,
+ "execution_count": 51,
"id": "f98b84c7",
"metadata": {},
"outputs": [
@@ -775,7 +775,7 @@
"Name: BUN, dtype: float64"
]
},
- "execution_count": 181,
+ "execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
@@ -794,7 +794,7 @@
},
{
"cell_type": "code",
- "execution_count": 182,
+ "execution_count": 52,
"id": "a9add765",
"metadata": {},
"outputs": [
@@ -822,7 +822,7 @@
},
{
"cell_type": "code",
- "execution_count": 183,
+ "execution_count": 53,
"id": "752e448c",
"metadata": {},
"outputs": [],
@@ -835,7 +835,7 @@
},
{
"cell_type": "code",
- "execution_count": 184,
+ "execution_count": 54,
"id": "a8441035",
"metadata": {},
"outputs": [],
@@ -846,7 +846,7 @@
},
{
"cell_type": "code",
- "execution_count": 185,
+ "execution_count": 55,
"id": "378ecf72",
"metadata": {},
"outputs": [
@@ -892,7 +892,7 @@
},
{
"cell_type": "code",
- "execution_count": 186,
+ "execution_count": 56,
"id": "39477c2a",
"metadata": {},
"outputs": [
@@ -953,7 +953,7 @@
},
{
"cell_type": "code",
- "execution_count": 187,
+ "execution_count": 57,
"id": "651dd699",
"metadata": {},
"outputs": [],
@@ -961,24 +961,25 @@
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFE,SelectKBest\n",
"from sklearn.model_selection import train_test_split\n",
- "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
+ "from sklearn.metrics import accuracy_score, confusion_matrix\n",
+ "\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
- "execution_count": 188,
+ "execution_count": 58,
"id": "e1801370",
"metadata": {},
"outputs": [],
"source": [
- "data_new=data_new.drop(['Gender'],axis=1)\n"
+ "data_new=data_new.drop('Gender',axis=1)\n"
]
},
{
"cell_type": "code",
- "execution_count": 189,
+ "execution_count": 59,
"id": "eb6127cc",
"metadata": {},
"outputs": [
@@ -1088,7 +1089,7 @@
"4 50 24.0 3.6 1.3 0.9 2.1 50.0 2.0 0"
]
},
- "execution_count": 189,
+ "execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
@@ -1099,7 +1100,7 @@
},
{
"cell_type": "code",
- "execution_count": 190,
+ "execution_count": 60,
"id": "12304d13",
"metadata": {},
"outputs": [],
@@ -1110,28 +1111,25 @@
},
{
"cell_type": "code",
- "execution_count": 191,
+ "execution_count": 61,
"id": "dfd08af6",
"metadata": {},
"outputs": [],
"source": [
"X_f = SelectKBest(f_classif, k=5).fit_transform(X, y)\n",
"\n",
- "# Assuming X has feature names and k=5\n",
+ "\n",
"selector = SelectKBest(f_classif, k=5).fit(X, y)\n",
"\n",
- "# Check scikit-learn version (optional)\n",
- "# import sklearn as skl\n",
- "# print(f\"scikit-learn version: {skl.__version__}\")\n",
"\n",
- "if hasattr(selector, 'support_'): # For older versions\n",
+ "if hasattr(selector, 'support_'): \n",
" support = selector.support_\n",
"else:\n",
- " support = selector.get_support() # For newer versions\n",
+ " support = selector.get_support() \n",
+ "\n",
+ "selected_features = X.columns[support] \n",
"\n",
- "selected_features = X.columns[support] # List of selected feature names\n",
"\n",
- "# Columns to filter out (original features not in selected_features)\n",
"columns_to_filter = [col for col in X.columns if col not in selected_features]\n",
"\n"
]
@@ -1146,7 +1144,7 @@
},
{
"cell_type": "code",
- "execution_count": 192,
+ "execution_count": 62,
"id": "237cd728",
"metadata": {},
"outputs": [
@@ -1156,7 +1154,7 @@
"['Chol', 'Cr', 'BUN']"
]
},
- "execution_count": 192,
+ "execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
@@ -1175,17 +1173,17 @@
},
{
"cell_type": "code",
- "execution_count": 193,
+ "execution_count": 63,
"id": "40e0c236",
"metadata": {},
"outputs": [],
"source": [
- "data_new=data_new.drop(['Chol','Cr','BUN'],axis=1)"
+ "data_new_kbest=data_new.drop(['Chol','Cr','BUN'],axis=1)"
]
},
{
"cell_type": "code",
- "execution_count": 194,
+ "execution_count": 64,
"id": "6f5836ec",
"metadata": {},
"outputs": [
@@ -1340,13 +1338,13 @@
"[5132 rows x 6 columns]"
]
},
- "execution_count": 194,
+ "execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "data_new"
+ "data_new_kbest"
]
},
{
@@ -1359,9 +1357,351 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 66,
"id": "67e27ab3",
"metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.8198636806231743\n",
+ "[[536 68]\n",
+ " [117 306]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Baseline random forest on the SelectKBest feature subset, scored on a 20% hold-out split.\n",
+ "y_f= data_new_kbest['Diagnosis']\n",
+ "X_f=data_new_kbest.drop(['Diagnosis'],axis=1)\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_f, y_f, test_size=0.2, random_state=42)\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "rfc = RandomForestClassifier(n_estimators=100, random_state=42)  # fixed seed so the reported accuracy is reproducible\n",
+ "rfc.fit(X_train, y_train)\n",
+ "y_pred = rfc.predict(X_test)\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print(\"Accuracy:\", accuracy)\n",
+ "cm = confusion_matrix(y_test, y_pred)  # rows are true classes, columns are predicted classes\n",
+ "print(cm)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8e6b9595",
+ "metadata": {},
+ "source": [
+ "Applying a random forest classifier to the dataset obtained by using the F-statistic in SelectKBest; we also observe the model accuracy and the confusion matrix."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "id": "f63ffb4d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of selected features: 8\n",
+ "Selected Features: Index(['Age', 'BMI', 'Chol', 'TG', 'HDL', 'LDL', 'Cr', 'BUN'], dtype='object')\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Recursive feature elimination with 5-fold cross-validation, fitted on the training split only.\n",
+ "from sklearn.feature_selection import RFECV\n",
+ "X_rfecv = data_new.drop('Diagnosis', axis=1)\n",
+ "y_rfecv = data_new['Diagnosis']\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_rfecv, y_rfecv, test_size=0.2, random_state=42)\n",
+ "rfecv = RFECV(estimator=RandomForestClassifier(n_estimators=1000, random_state=42), cv=5)  # seeded so the selection is reproducible\n",
+ "rfecv.fit(X_train, y_train)\n",
+ "print(\"Number of selected features:\", rfecv.n_features_)\n",
+ "# Index the frame RFECV was fitted on: the boolean mask in support_ lines up with its columns.\n",
+ "selected_features = X_rfecv.columns[rfecv.support_]\n",
+ "print(\"Selected Features:\", selected_features)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e9ef9701",
+ "metadata": {},
+ "source": [
+ "Applying Recursive Feature Elimination with Cross Validation to filter out features"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "id": "22dbb42e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_new_rfecv=data_new[list(selected_features)+['Diagnosis']]  # keep only the RFECV-selected columns plus the target\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "id": "d96d1966",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Age | \n",
+ " BMI | \n",
+ " Chol | \n",
+ " TG | \n",
+ " HDL | \n",
+ " LDL | \n",
+ " Cr | \n",
+ " BUN | \n",
+ " Diagnosis | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 50 | \n",
+ " 24.0 | \n",
+ " 4.20 | \n",
+ " 0.90 | \n",
+ " 2.40 | \n",
+ " 1.40 | \n",
+ " 46.0 | \n",
+ " 4.70 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 26 | \n",
+ " 23.0 | \n",
+ " 3.70 | \n",
+ " 1.40 | \n",
+ " 1.10 | \n",
+ " 2.10 | \n",
+ " 62.0 | \n",
+ " 4.50 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 33 | \n",
+ " 21.0 | \n",
+ " 4.90 | \n",
+ " 1.00 | \n",
+ " 0.80 | \n",
+ " 2.00 | \n",
+ " 46.0 | \n",
+ " 7.10 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 45 | \n",
+ " 21.0 | \n",
+ " 2.90 | \n",
+ " 1.00 | \n",
+ " 1.00 | \n",
+ " 1.50 | \n",
+ " 24.0 | \n",
+ " 2.30 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 50 | \n",
+ " 24.0 | \n",
+ " 3.60 | \n",
+ " 1.30 | \n",
+ " 0.90 | \n",
+ " 2.10 | \n",
+ " 50.0 | \n",
+ " 2.00 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 5127 | \n",
+ " 54 | \n",
+ " 23.0 | \n",
+ " 5.00 | \n",
+ " 1.50 | \n",
+ " 1.24 | \n",
+ " 2.98 | \n",
+ " 77.0 | \n",
+ " 3.50 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 5128 | \n",
+ " 50 | \n",
+ " 22.0 | \n",
+ " 4.37 | \n",
+ " 2.09 | \n",
+ " 1.37 | \n",
+ " 2.29 | \n",
+ " 47.3 | \n",
+ " 4.40 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 5129 | \n",
+ " 67 | \n",
+ " 24.0 | \n",
+ " 3.89 | \n",
+ " 1.38 | \n",
+ " 1.14 | \n",
+ " 2.17 | \n",
+ " 70.6 | \n",
+ " 4.73 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 5130 | \n",
+ " 60 | \n",
+ " 29.0 | \n",
+ " 5.91 | \n",
+ " 1.29 | \n",
+ " 1.73 | \n",
+ " 2.85 | \n",
+ " 50.2 | \n",
+ " 7.33 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 5131 | \n",
+ " 37 | \n",
+ " 34.0 | \n",
+ " 5.42 | \n",
+ " 2.66 | \n",
+ " 1.08 | \n",
+ " 2.87 | \n",
+ " 75.5 | \n",
+ " 4.61 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5132 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Age BMI Chol TG HDL LDL Cr BUN Diagnosis\n",
+ "0 50 24.0 4.20 0.90 2.40 1.40 46.0 4.70 0\n",
+ "1 26 23.0 3.70 1.40 1.10 2.10 62.0 4.50 0\n",
+ "2 33 21.0 4.90 1.00 0.80 2.00 46.0 7.10 0\n",
+ "3 45 21.0 2.90 1.00 1.00 1.50 24.0 2.30 0\n",
+ "4 50 24.0 3.60 1.30 0.90 2.10 50.0 2.00 0\n",
+ "... ... ... ... ... ... ... ... ... ...\n",
+ "5127 54 23.0 5.00 1.50 1.24 2.98 77.0 3.50 1\n",
+ "5128 50 22.0 4.37 2.09 1.37 2.29 47.3 4.40 1\n",
+ "5129 67 24.0 3.89 1.38 1.14 2.17 70.6 4.73 1\n",
+ "5130 60 29.0 5.91 1.29 1.73 2.85 50.2 7.33 1\n",
+ "5131 37 34.0 5.42 2.66 1.08 2.87 75.5 4.61 1\n",
+ "\n",
+ "[5132 rows x 9 columns]"
+ ]
+ },
+ "execution_count": 75,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_new_rfecv"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "df381a5f",
+ "metadata": {},
+ "source": [
+ "New dataset generated after using RFECV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "id": "084425a3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.818889970788705\n",
+ "[[539 65]\n",
+ " [121 302]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "y_rfecv= data_new_rfecv['Diagnosis']  # target column of the RFECV dataset\n",
+ "X_rfecv=data_new_rfecv.drop(['Diagnosis'],axis=1)\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_rfecv, y_rfecv, test_size=0.2, random_state=42)  # same split settings as the SelectKBest run\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "rfc = RandomForestClassifier(n_estimators=100, random_state=42)  # fixed seed so the reported accuracy is reproducible\n",
+ "rfc.fit(X_train, y_train)\n",
+ "y_pred = rfc.predict(X_test)\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print(\"Accuracy:\", accuracy)\n",
+ "cm = confusion_matrix(y_test, y_pred)  # rows are true classes, columns are predicted classes\n",
+ "print(cm)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a52ed238",
+ "metadata": {},
+ "source": [
+ "Applying a random forest classifier to the dataset obtained by using RFECV; we also observe the model accuracy and the confusion matrix."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "59349ad6",
+ "metadata": {},
+ "source": [
+ "Since the accuracy score of the model trained on the dataset generated by SelectKBest is slightly higher than that of the model trained on the dataset generated by RFECV, we will use the 5 features selected by SelectKBest, as they have the most significant relationship with Diagnosis."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "82916004",
+ "metadata": {},
"outputs": [],
"source": []
}
diff --git a/Diabetes Classification/diabetesclassification.ipynb b/Diabetes Classification/diabetesclassification.ipynb
index d4d68ed..0f854e2 100644
--- a/Diabetes Classification/diabetesclassification.ipynb
+++ b/Diabetes Classification/diabetesclassification.ipynb
@@ -10,7 +10,7 @@
},
{
"cell_type": "code",
- "execution_count": 166,
+ "execution_count": 36,
"id": "312c95a1",
"metadata": {},
"outputs": [],
@@ -29,7 +29,7 @@
},
{
"cell_type": "code",
- "execution_count": 167,
+ "execution_count": 37,
"id": "aea1b45b",
"metadata": {},
"outputs": [],
@@ -47,7 +47,7 @@
},
{
"cell_type": "code",
- "execution_count": 168,
+ "execution_count": 38,
"id": "2ac5c17b",
"metadata": {},
"outputs": [
@@ -275,7 +275,7 @@
"[5132 rows x 11 columns]"
]
},
- "execution_count": 168,
+ "execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
@@ -286,7 +286,7 @@
},
{
"cell_type": "code",
- "execution_count": 169,
+ "execution_count": 39,
"id": "c4484360",
"metadata": {},
"outputs": [
@@ -408,7 +408,7 @@
"4 4 50 F 24 3.6 1.3 0.9 2.1 50.0 2.0 0"
]
},
- "execution_count": 169,
+ "execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
@@ -427,7 +427,7 @@
},
{
"cell_type": "code",
- "execution_count": 170,
+ "execution_count": 40,
"id": "ea7c6dcb",
"metadata": {},
"outputs": [
@@ -437,7 +437,7 @@
"(5132, 11)"
]
},
- "execution_count": 170,
+ "execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
@@ -456,7 +456,7 @@
},
{
"cell_type": "code",
- "execution_count": 171,
+ "execution_count": 41,
"id": "a3c2ff78",
"metadata": {},
"outputs": [
@@ -499,7 +499,7 @@
},
{
"cell_type": "code",
- "execution_count": 172,
+ "execution_count": 42,
"id": "a9cc7614",
"metadata": {},
"outputs": [],
@@ -510,7 +510,7 @@
},
{
"cell_type": "code",
- "execution_count": 173,
+ "execution_count": 43,
"id": "cdee3c52",
"metadata": {
"scrolled": true
@@ -554,7 +554,7 @@
},
{
"cell_type": "code",
- "execution_count": 174,
+ "execution_count": 44,
"id": "2f353609",
"metadata": {},
"outputs": [
@@ -572,7 +572,7 @@
"Name: Age, dtype: float64"
]
},
- "execution_count": 174,
+ "execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
@@ -583,7 +583,7 @@
},
{
"cell_type": "code",
- "execution_count": 175,
+ "execution_count": 45,
"id": "b7887949",
"metadata": {},
"outputs": [
@@ -601,7 +601,7 @@
"Name: BMI, dtype: float64"
]
},
- "execution_count": 175,
+ "execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
@@ -612,7 +612,7 @@
},
{
"cell_type": "code",
- "execution_count": 176,
+ "execution_count": 46,
"id": "fdf468b3",
"metadata": {},
"outputs": [
@@ -630,7 +630,7 @@
"Name: Chol, dtype: float64"
]
},
- "execution_count": 176,
+ "execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
@@ -641,7 +641,7 @@
},
{
"cell_type": "code",
- "execution_count": 177,
+ "execution_count": 47,
"id": "7877549d",
"metadata": {},
"outputs": [
@@ -659,7 +659,7 @@
"Name: TG, dtype: float64"
]
},
- "execution_count": 177,
+ "execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
@@ -670,7 +670,7 @@
},
{
"cell_type": "code",
- "execution_count": 178,
+ "execution_count": 48,
"id": "71ec5fcb",
"metadata": {},
"outputs": [
@@ -688,7 +688,7 @@
"Name: HDL, dtype: float64"
]
},
- "execution_count": 178,
+ "execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
@@ -699,7 +699,7 @@
},
{
"cell_type": "code",
- "execution_count": 179,
+ "execution_count": 49,
"id": "bbd3682c",
"metadata": {},
"outputs": [
@@ -717,7 +717,7 @@
"Name: LDL, dtype: float64"
]
},
- "execution_count": 179,
+ "execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
@@ -728,7 +728,7 @@
},
{
"cell_type": "code",
- "execution_count": 180,
+ "execution_count": 50,
"id": "c4603e52",
"metadata": {},
"outputs": [
@@ -746,7 +746,7 @@
"Name: Cr, dtype: float64"
]
},
- "execution_count": 180,
+ "execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
@@ -757,7 +757,7 @@
},
{
"cell_type": "code",
- "execution_count": 181,
+ "execution_count": 51,
"id": "f98b84c7",
"metadata": {},
"outputs": [
@@ -775,7 +775,7 @@
"Name: BUN, dtype: float64"
]
},
- "execution_count": 181,
+ "execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
@@ -794,7 +794,7 @@
},
{
"cell_type": "code",
- "execution_count": 182,
+ "execution_count": 52,
"id": "a9add765",
"metadata": {},
"outputs": [
@@ -822,7 +822,7 @@
},
{
"cell_type": "code",
- "execution_count": 183,
+ "execution_count": 53,
"id": "752e448c",
"metadata": {},
"outputs": [],
@@ -835,7 +835,7 @@
},
{
"cell_type": "code",
- "execution_count": 184,
+ "execution_count": 54,
"id": "a8441035",
"metadata": {},
"outputs": [],
@@ -846,7 +846,7 @@
},
{
"cell_type": "code",
- "execution_count": 185,
+ "execution_count": 55,
"id": "378ecf72",
"metadata": {},
"outputs": [
@@ -892,7 +892,7 @@
},
{
"cell_type": "code",
- "execution_count": 186,
+ "execution_count": 56,
"id": "39477c2a",
"metadata": {},
"outputs": [
@@ -953,7 +953,7 @@
},
{
"cell_type": "code",
- "execution_count": 187,
+ "execution_count": 57,
"id": "651dd699",
"metadata": {},
"outputs": [],
@@ -961,24 +961,25 @@
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFE,SelectKBest\n",
"from sklearn.model_selection import train_test_split\n",
- "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
+ "from sklearn.metrics import accuracy_score, confusion_matrix\n",
+ "\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
- "execution_count": 188,
+ "execution_count": 58,
"id": "e1801370",
"metadata": {},
"outputs": [],
"source": [
- "data_new=data_new.drop(['Gender'],axis=1)\n"
+ "data_new=data_new.drop('Gender',axis=1)\n"
]
},
{
"cell_type": "code",
- "execution_count": 189,
+ "execution_count": 59,
"id": "eb6127cc",
"metadata": {},
"outputs": [
@@ -1088,7 +1089,7 @@
"4 50 24.0 3.6 1.3 0.9 2.1 50.0 2.0 0"
]
},
- "execution_count": 189,
+ "execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
@@ -1099,7 +1100,7 @@
},
{
"cell_type": "code",
- "execution_count": 190,
+ "execution_count": 60,
"id": "12304d13",
"metadata": {},
"outputs": [],
@@ -1110,28 +1111,25 @@
},
{
"cell_type": "code",
- "execution_count": 191,
+ "execution_count": 61,
"id": "dfd08af6",
"metadata": {},
"outputs": [],
"source": [
"X_f = SelectKBest(f_classif, k=5).fit_transform(X, y)\n",
"\n",
- "# Assuming X has feature names and k=5\n",
+ "\n",
"selector = SelectKBest(f_classif, k=5).fit(X, y)\n",
"\n",
- "# Check scikit-learn version (optional)\n",
- "# import sklearn as skl\n",
- "# print(f\"scikit-learn version: {skl.__version__}\")\n",
"\n",
- "if hasattr(selector, 'support_'): # For older versions\n",
+ "if hasattr(selector, 'support_'): \n",
" support = selector.support_\n",
"else:\n",
- " support = selector.get_support() # For newer versions\n",
+ " support = selector.get_support() \n",
+ "\n",
+ "selected_features = X.columns[support] \n",
"\n",
- "selected_features = X.columns[support] # List of selected feature names\n",
"\n",
- "# Columns to filter out (original features not in selected_features)\n",
"columns_to_filter = [col for col in X.columns if col not in selected_features]\n",
"\n"
]
@@ -1146,7 +1144,7 @@
},
{
"cell_type": "code",
- "execution_count": 192,
+ "execution_count": 62,
"id": "237cd728",
"metadata": {},
"outputs": [
@@ -1156,7 +1154,7 @@
"['Chol', 'Cr', 'BUN']"
]
},
- "execution_count": 192,
+ "execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
@@ -1175,17 +1173,17 @@
},
{
"cell_type": "code",
- "execution_count": 193,
+ "execution_count": 63,
"id": "40e0c236",
"metadata": {},
"outputs": [],
"source": [
- "data_new=data_new.drop(['Chol','Cr','BUN'],axis=1)"
+ "data_new_kbest=data_new.drop(['Chol','Cr','BUN'],axis=1)"
]
},
{
"cell_type": "code",
- "execution_count": 194,
+ "execution_count": 64,
"id": "6f5836ec",
"metadata": {},
"outputs": [
@@ -1340,13 +1338,13 @@
"[5132 rows x 6 columns]"
]
},
- "execution_count": 194,
+ "execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "data_new"
+ "data_new_kbest"
]
},
{
@@ -1359,9 +1357,351 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 66,
"id": "67e27ab3",
"metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.8198636806231743\n",
+ "[[536 68]\n",
+ " [117 306]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Baseline random forest on the SelectKBest feature subset, scored on a 20% hold-out split.\n",
+ "y_f= data_new_kbest['Diagnosis']\n",
+ "X_f=data_new_kbest.drop(['Diagnosis'],axis=1)\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_f, y_f, test_size=0.2, random_state=42)\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "rfc = RandomForestClassifier(n_estimators=100, random_state=42)  # fixed seed so the reported accuracy is reproducible\n",
+ "rfc.fit(X_train, y_train)\n",
+ "y_pred = rfc.predict(X_test)\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print(\"Accuracy:\", accuracy)\n",
+ "cm = confusion_matrix(y_test, y_pred)  # rows are true classes, columns are predicted classes\n",
+ "print(cm)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8e6b9595",
+ "metadata": {},
+ "source": [
+ "Applying a random forest classifier to the dataset obtained by using the F-statistic in SelectKBest; we also observe the model accuracy and the confusion matrix."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "id": "f63ffb4d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of selected features: 8\n",
+ "Selected Features: Index(['Age', 'BMI', 'Chol', 'TG', 'HDL', 'LDL', 'Cr', 'BUN'], dtype='object')\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Recursive feature elimination with 5-fold cross-validation, fitted on the training split only.\n",
+ "from sklearn.feature_selection import RFECV\n",
+ "X_rfecv = data_new.drop('Diagnosis', axis=1)\n",
+ "y_rfecv = data_new['Diagnosis']\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_rfecv, y_rfecv, test_size=0.2, random_state=42)\n",
+ "rfecv = RFECV(estimator=RandomForestClassifier(n_estimators=1000, random_state=42), cv=5)  # seeded so the selection is reproducible\n",
+ "rfecv.fit(X_train, y_train)\n",
+ "print(\"Number of selected features:\", rfecv.n_features_)\n",
+ "# Index the frame RFECV was fitted on: the boolean mask in support_ lines up with its columns.\n",
+ "selected_features = X_rfecv.columns[rfecv.support_]\n",
+ "print(\"Selected Features:\", selected_features)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e9ef9701",
+ "metadata": {},
+ "source": [
+ "Applying Recursive Feature Elimination with Cross Validation to filter out features"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "id": "22dbb42e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_new_rfecv=data_new[list(selected_features)+['Diagnosis']]  # keep only the RFECV-selected columns plus the target\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "id": "d96d1966",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Age | \n",
+ " BMI | \n",
+ " Chol | \n",
+ " TG | \n",
+ " HDL | \n",
+ " LDL | \n",
+ " Cr | \n",
+ " BUN | \n",
+ " Diagnosis | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 50 | \n",
+ " 24.0 | \n",
+ " 4.20 | \n",
+ " 0.90 | \n",
+ " 2.40 | \n",
+ " 1.40 | \n",
+ " 46.0 | \n",
+ " 4.70 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 26 | \n",
+ " 23.0 | \n",
+ " 3.70 | \n",
+ " 1.40 | \n",
+ " 1.10 | \n",
+ " 2.10 | \n",
+ " 62.0 | \n",
+ " 4.50 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 33 | \n",
+ " 21.0 | \n",
+ " 4.90 | \n",
+ " 1.00 | \n",
+ " 0.80 | \n",
+ " 2.00 | \n",
+ " 46.0 | \n",
+ " 7.10 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 45 | \n",
+ " 21.0 | \n",
+ " 2.90 | \n",
+ " 1.00 | \n",
+ " 1.00 | \n",
+ " 1.50 | \n",
+ " 24.0 | \n",
+ " 2.30 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 50 | \n",
+ " 24.0 | \n",
+ " 3.60 | \n",
+ " 1.30 | \n",
+ " 0.90 | \n",
+ " 2.10 | \n",
+ " 50.0 | \n",
+ " 2.00 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 5127 | \n",
+ " 54 | \n",
+ " 23.0 | \n",
+ " 5.00 | \n",
+ " 1.50 | \n",
+ " 1.24 | \n",
+ " 2.98 | \n",
+ " 77.0 | \n",
+ " 3.50 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 5128 | \n",
+ " 50 | \n",
+ " 22.0 | \n",
+ " 4.37 | \n",
+ " 2.09 | \n",
+ " 1.37 | \n",
+ " 2.29 | \n",
+ " 47.3 | \n",
+ " 4.40 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 5129 | \n",
+ " 67 | \n",
+ " 24.0 | \n",
+ " 3.89 | \n",
+ " 1.38 | \n",
+ " 1.14 | \n",
+ " 2.17 | \n",
+ " 70.6 | \n",
+ " 4.73 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 5130 | \n",
+ " 60 | \n",
+ " 29.0 | \n",
+ " 5.91 | \n",
+ " 1.29 | \n",
+ " 1.73 | \n",
+ " 2.85 | \n",
+ " 50.2 | \n",
+ " 7.33 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 5131 | \n",
+ " 37 | \n",
+ " 34.0 | \n",
+ " 5.42 | \n",
+ " 2.66 | \n",
+ " 1.08 | \n",
+ " 2.87 | \n",
+ " 75.5 | \n",
+ " 4.61 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5132 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Age BMI Chol TG HDL LDL Cr BUN Diagnosis\n",
+ "0 50 24.0 4.20 0.90 2.40 1.40 46.0 4.70 0\n",
+ "1 26 23.0 3.70 1.40 1.10 2.10 62.0 4.50 0\n",
+ "2 33 21.0 4.90 1.00 0.80 2.00 46.0 7.10 0\n",
+ "3 45 21.0 2.90 1.00 1.00 1.50 24.0 2.30 0\n",
+ "4 50 24.0 3.60 1.30 0.90 2.10 50.0 2.00 0\n",
+ "... ... ... ... ... ... ... ... ... ...\n",
+ "5127 54 23.0 5.00 1.50 1.24 2.98 77.0 3.50 1\n",
+ "5128 50 22.0 4.37 2.09 1.37 2.29 47.3 4.40 1\n",
+ "5129 67 24.0 3.89 1.38 1.14 2.17 70.6 4.73 1\n",
+ "5130 60 29.0 5.91 1.29 1.73 2.85 50.2 7.33 1\n",
+ "5131 37 34.0 5.42 2.66 1.08 2.87 75.5 4.61 1\n",
+ "\n",
+ "[5132 rows x 9 columns]"
+ ]
+ },
+ "execution_count": 75,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_new_rfecv"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "df381a5f",
+ "metadata": {},
+ "source": [
+ "New dataset generated after using RFECV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "id": "084425a3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.818889970788705\n",
+ "[[539 65]\n",
+ " [121 302]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "y_rfecv= data_new_rfecv['Diagnosis']  # target column of the RFECV dataset\n",
+ "X_rfecv=data_new_rfecv.drop(['Diagnosis'],axis=1)\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_rfecv, y_rfecv, test_size=0.2, random_state=42)  # same split settings as the SelectKBest run\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "rfc = RandomForestClassifier(n_estimators=100, random_state=42)  # fixed seed so the reported accuracy is reproducible\n",
+ "rfc.fit(X_train, y_train)\n",
+ "y_pred = rfc.predict(X_test)\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print(\"Accuracy:\", accuracy)\n",
+ "cm = confusion_matrix(y_test, y_pred)  # rows are true classes, columns are predicted classes\n",
+ "print(cm)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a52ed238",
+ "metadata": {},
+ "source": [
+ "Applying a random forest classifier to the dataset obtained by using RFECV; we also observe the model accuracy and the confusion matrix."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "59349ad6",
+ "metadata": {},
+ "source": [
+ "Since the accuracy score of the model trained on the dataset generated by SelectKBest is slightly higher than that of the model trained on the dataset generated by RFECV, we will use the 5 features selected by SelectKBest, as they have the most significant relationship with Diagnosis."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "82916004",
+ "metadata": {},
"outputs": [],
"source": []
}