diff --git a/22-06-10-Naive_Bayes/Codes.ipynb b/22-06-10-Naive_Bayes/Codes.ipynb new file mode 100644 index 0000000..0e26067 --- /dev/null +++ b/22-06-10-Naive_Bayes/Codes.ipynb @@ -0,0 +1,2316 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Naive Bayes.ipynb", + "provenance": [], + "collapsed_sections": [], + "authorship_tag": "ABX9TyMuPZYgLeZZEYtTydvuY0vm", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "**Notes**\n", + "\n", + "[open in browser]('https://onedrive.live.com/view.aspx?resid=B7999797B4FEAE64%211201&id=documents&wd=target%28Naive%20Bayes.one%7C5A03ED1E-CF67-4C12-AD8B-C16112311CD6%2FIntro%7C26B8385D-A330-42F6-9B6F-C565F43A2472%2F%29')\n", + "\n", + "[open in OneNote]('onenote:https://d.docs.live.net/b7999797b4feae64/Documents/DS%20December/Naive%20Bayes.one#Intro§ion-id={5A03ED1E-CF67-4C12-AD8B-C16112311CD6}&page-id={26B8385D-A330-42F6-9B6F-C565F43A2472}&end')" + ], + "metadata": { + "id": "tkHggcg_bKPf" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "bqorb4nTy4XE" + }, + "outputs": [], + "source": [ + "import plotly.express as px\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Scratch\n", + "\n", + "i used this to explane implementation **play_tennis**" + ], + "metadata": { + "id": "Tj6lBRxtTDBD" + } + }, + { + "cell_type": "code", + "source": [ + "df = pd.read_csv('https://gist.githubusercontent.com/DiogoRibeiro7/c6590d0cf119e87c39e31c21a9c0f3a8/raw/4a8e3da267a0c1f0d650901d8295a5153bde8b21/PlayTennis.csv')\n", + "df" + ], + "metadata": { + "id": "ti-n0DM8sgYd", + "outputId": "5030f2f9-6091-4406-fd66-827f57d10250", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 488 + } + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Outlook Temperature Humidity Wind Play Tennis\n", + "0 Sunny Hot High Weak No\n", + "1 Sunny Hot High Strong No\n", + "2 Overcast Hot High Weak Yes\n", + "3 Rain Mild High Weak Yes\n", + "4 Rain Cool Normal Weak Yes\n", + "5 Rain Cool Normal Strong No\n", + "6 Overcast Cool Normal Strong Yes\n", + "7 Sunny Mild High Weak No\n", + "8 Sunny Cool Normal Weak Yes\n", + "9 Rain Mild Normal Weak Yes\n", + "10 Sunny Mild Normal Strong Yes\n", + "11 Overcast Mild High Strong Yes\n", + "12 Overcast Hot Normal Weak Yes\n", + "13 Rain Mild High Strong No" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OutlookTemperatureHumidityWindPlay Tennis
0SunnyHotHighWeakNo
1SunnyHotHighStrongNo
2OvercastHotHighWeakYes
3RainMildHighWeakYes
4RainCoolNormalWeakYes
5RainCoolNormalStrongNo
6OvercastCoolNormalStrongYes
7SunnyMildHighWeakNo
8SunnyCoolNormalWeakYes
9RainMildNormalWeakYes
10SunnyMildNormalStrongYes
11OvercastMildHighStrongYes
12OvercastHotNormalWeakYes
13RainMildHighStrongNo
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 2 + } + ] + }, + { + "cell_type": "code", + "source": [ + "pd.crosstab(df['Outlook'], df['Play Tennis'], normalize='columns')" + ], + "metadata": { + "id": "o20tVEUhslOa", + "outputId": "36bc0063-99d6-4885-b767-73f1cecdef03", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + } + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Play Tennis No Yes\n", + "Outlook \n", + "Overcast 0.0 0.444444\n", + "Rain 0.4 0.333333\n", + "Sunny 0.6 0.222222" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Play TennisNoYes
Outlook
Overcast0.00.444444
Rain0.40.333333
Sunny0.60.222222
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "Kg84gTlSslLj" + }, + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "iris = px.data.iris()" + ], + "metadata": { + "id": "d6nEo9SGzWUy" + }, + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# X = iris.drop(['species_id', 'species'], axis = 1)\n", + "# Y = iris['species_id']\n", + "\n", + "from sklearn.preprocessing import OrdinalEncoder\n", + "X = df.drop(['Play Tennis'], axis = 1)\n", + "Y = df['Play Tennis']" + ], + "metadata": { + "id": "CKh0-frNzll3" + }, + "execution_count": 27, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics import accuracy_score\n", + "\n", + "class NaiveBayes():\n", + " def __init__(self):\n", + " pass\n", + " \n", + " def fit(self, X, Y):\n", + " self.label_type = Y.dtype\n", + " self.prob = {}\n", + " self.label_prob = Y.value_counts(normalize=True)\n", + " self.features = X.columns\n", + "\n", + " for col in X.columns:\n", + " self.prob[col] = pd.crosstab(X[col], Y, normalize = 'columns')\n", + " \n", + " return self.prob\n", + " \n", + " def pred(self, x):\n", + " \n", + " maxprob = 0\n", + " maxlab = ''\n", + " for lab in self.label_prob.index:\n", + " currprob = self.label_prob[lab]\n", + " currlab = lab\n", + " \n", + " for fet in self.features:\n", + " p = self.prob[fet][lab][x[fet]]\n", + " currprob = currprob*p\n", + " \n", + " if currprob > maxprob:\n", + " maxprob = currprob\n", + " maxlab = currlab\n", + " \n", + " return maxlab\n", + "\n", + " def predict(self, X):\n", + " y_pred = np.full(X.shape[0], 0,dtype = self.label_type)\n", + "\n", + " for i, (_, x) in enumerate(X.iterrows()):\n", + " lab = self.pred(x)\n", + " y_pred[i] = lab\n", + " \n", + " return y_pred\n", + " \n", + " def score(self, X, Y):\n", + " return accuracy_score(self.predict(X), Y)" + ], + "metadata": { + "id": "-6klPoA31hkI" + }, + "execution_count": 28, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model = NaiveBayes()\n", + "prob = model.fit(X, Y)" + ], + "metadata": { + "id": "ucIUTwH42gZC" + }, + "execution_count": 29, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model.predict(X)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Zj8LwA9TMD4n", + "outputId": "4441d02f-ef4c-4573-a6dd-79a8ae80d315" + }, + "execution_count": 30, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array(['No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes',\n", + " 'Yes', 'Yes', 'Yes', 'No'], dtype=object)" + ] + }, + "metadata": {}, + "execution_count": 30 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.score(X, Y)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "l1q-nnUIJMiK", + "outputId": "03f80920-8f45-4184-90eb-2e5a92513672" + }, + "execution_count": 31, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.9285714285714286" + ] + }, + "metadata": {}, + "execution_count": 31 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.naive_bayes import MultinomialNB\n", + "\n", + "modelsk = MultinomialNB()\n", + "\n", + "X = OrdinalEncoder().fit_transform(X)\n", + "\n", + "modelsk.fit(X, Y)\n", + "# accuracy_score(modelsk.predict(X), Y)\n", + "modelsk.score(X, Y)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SPMrudjK2mIW", + "outputId": "8d34222c-6087-4f97-f449-68cc6cd2dcf4" + }, + "execution_count": 32, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.8571428571428571" + ] + }, + "metadata": {}, + "execution_count": 32 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Project" + ], + "metadata": { + "id": "alxgN5iQTAOX" + } + }, + { + "cell_type": "code", + "source": [ + "data = df = pd.read_csv(\"https://raw.githubusercontent.com/DependerKumarSoni/Naive-Bayes/main/adult.csv\")\n", + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "4t3OZXtcTBKr", + "outputId": "c035feff-1507-454a-baae-b7fb4da2aade" + }, + "execution_count": 33, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 39 State-gov 77516 Bachelors 13 Never-married \\\n", + "0 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse \n", + "1 38 Private 215646 HS-grad 9 Divorced \n", + "2 53 Private 234721 11th 7 Married-civ-spouse \n", + "3 28 Private 338409 Bachelors 13 Married-civ-spouse \n", + "4 37 Private 284582 Masters 14 Married-civ-spouse \n", + "\n", + " Adm-clerical Not-in-family White Male 2174 0 40 \\\n", + "0 Exec-managerial Husband White Male 0 0 13 \n", + "1 Handlers-cleaners Not-in-family White Male 0 0 40 \n", + "2 Handlers-cleaners Husband Black Male 0 0 40 \n", + "3 Prof-specialty Wife Black Female 0 0 40 \n", + "4 Exec-managerial Wife White Female 0 0 40 \n", + "\n", + " United-States <=50K \n", + "0 United-States <=50K \n", + "1 United-States <=50K \n", + "2 United-States <=50K \n", + "3 Cuba <=50K \n", + "4 United-States <=50K " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
39State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States<=50K
050Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States<=50K
138Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States<=50K
253Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States<=50K
328Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba<=50K
437Private284582Masters14Married-civ-spouseExec-managerialWifeWhiteFemale0040United-States<=50K
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 33 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Data Cleaning" + ], + "metadata": { + "id": "Wq691TTZT_sY" + } + }, + { + "cell_type": "code", + "source": [ + "col_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']\n", + "df.columns = col_names\n", + "df.sample(2)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 193 + }, + "id": "276K_qWYTyWe", + "outputId": "5f44e832-385f-4de0-cedd-65a24196212d" + }, + "execution_count": 34, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " age workclass fnlwgt education education_num marital_status \\\n", + "4260 26 Private 333541 HS-grad 9 Never-married \n", + "30961 35 Private 301862 HS-grad 9 Never-married \n", + "\n", + " occupation relationship race sex capital_gain \\\n", + "4260 Other-service Not-in-family White Male 0 \n", + "30961 Craft-repair Unmarried White Male 0 \n", + "\n", + " capital_loss hours_per_week native_country income \n", + "4260 0 24 United-States <=50K \n", + "30961 0 50 United-States <=50K " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageworkclassfnlwgteducationeducation_nummarital_statusoccupationrelationshipracesexcapital_gaincapital_losshours_per_weeknative_countryincome
426026Private333541HS-grad9Never-marriedOther-serviceNot-in-familyWhiteMale0024United-States<=50K
3096135Private301862HS-grad9Never-marriedCraft-repairUnmarriedWhiteMale0050United-States<=50K
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 34 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "**Objective**\n", + "The goal of this machine learning project is to predict whether a person makes over 50K a year or not given their demographic variation. This is a classification problem.\n", + "\n", + "\n", + "**1. Categorical Attributes**\n", + "\n", + "* **workclass**: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.\n", + "Individual work category\n", + "* **education**: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.\n", + "Individual's highest education degree\n", + "* **marital-status**: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.\n", + "Individual marital status\n", + "* **occupation**: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.\n", + "Individual's occupation\n", + "* **relationship**: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.\n", + "Individual's relation in a family\n", + "* **race**: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.\n", + "Race of Individual\n", + "* **sex**: Female, Male.\n", + "* **native-country**: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.\n", + "Individual's native country\n", + "\n", + "\n", + "**2. Continuous Attributes**\n", + "\n", + "* **age**: continuous.\n", + "Age of an individual\n", + "* **fnlwgt**: final weight, continuous.\n", + "The weights on the CPS files are controlled to independent estimates of the civilian noninstitutional population of the US. These are prepared monthly for us by Population Division here at the Census Bureau.\n", + "* **capital-gain**: continuous.\n", + "* **capital-loss**: continuous.\n", + "* **hours-per-week**: continuous.\n", + "Individual's working hour per week" + ], + "metadata": { + "id": "YxoEyfG-UZ3Q" + } + }, + { + "cell_type": "code", + "source": [ + "attrib, counts = np.unique(data['workclass'], return_counts = True)\n", + "most_freq_attrib = attrib[np.argmax(counts, axis = 0)]\n", + "data['workclass'][data['workclass'] == ' ?'] = most_freq_attrib" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kdWMMYohUB4m", + "outputId": "135c50ab-ea5d-4157-ae41-ef11ddbeec98" + }, + "execution_count": 35, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " This is separate from the ipykernel package so we can avoid doing imports until\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "data['occupation'].replace({' ?': np.nan}, inplace = True)\n", + "data['native_country'].replace({' ?': np.nan}, inplace = True)" + ], + "metadata": { + "id": "B-wD2y7fUi-q" + }, + "execution_count": 36, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.impute import SimpleImputer\n", + "\n", + "si = SimpleImputer(strategy = 'most_frequent')\n", + "data[['occupation', 'native_country']] = si.fit_transform(data[['occupation', 'native_country']])" + ], + "metadata": { + "id": "jPOZFeaaUou2" + }, + "execution_count": 37, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "data['income'] = np.where(data['income'] == ' <=50K', 0, 1)" + ], + "metadata": { + "id": "w-BrC12KUuSz" + }, + "execution_count": 38, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## EDA" + ], + "metadata": { + "id": "qTlskxwTU5Hk" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Feature Engineering" + ], + "metadata": { + "id": "IuG6784AU8sx" + } + }, + { + "cell_type": "code", + "source": [ + "num_col = ['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week']" + ], + "metadata": { + "id": "Ly--b-01Uzxy" + }, + "execution_count": 39, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "y = data['income']" + ], + "metadata": { + "id": "TPlyHYjkVBZy" + }, + "execution_count": 40, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "num_data = data[num_col]\n", + "cat_data = data.drop(num_col, axis = 1).drop(['income'], axis = 1)" + ], + "metadata": { + "id": "4mz1e_gvVCzl" + }, + "execution_count": 41, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Select and Train" + ], + "metadata": { + "id": "BpBROLCKVPZU" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Numerical" + ], + "metadata": { + "id": "u0bE1N-ZVTfn" + } + }, + { + "cell_type": "code", + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "for col in num_col:\n", + " sns.distplot(num_data[col])\n", + " plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "0l8POSxKVOGQ", + "outputId": "71e31fee-ed79-4ac5-c5de-1a2f7acb02d4" + }, + "execution_count": 42, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n", + " warnings.warn(msg, FutureWarning)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n", + " warnings.warn(msg, FutureWarning)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n", + " warnings.warn(msg, FutureWarning)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n", + " warnings.warn(msg, FutureWarning)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + } + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n", + " warnings.warn(msg, FutureWarning)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "source": [ + "X = num_data.drop(['capital_gain', 'capital_loss'], axis = 1)\n", + "X['capital_change'] = data['capital_gain']-data['capital_loss']" + ], + "metadata": { + "id": "xhzpnZ7WVbeM" + }, + "execution_count": 43, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import RobustScaler, MinMaxScaler\n", + "\n", + "a = RobustScaler().fit_transform(X)\n", + "X = pd.DataFrame(a, columns = X.columns)\n", + "\n", + "xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=20)" + ], + "metadata": { + "id": "nziUKkh-VwUm" + }, + "execution_count": 44, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.naive_bayes import GaussianNB\n", + "\n", + "gnb = GaussianNB()\n", + "gnb.fit(xtrain, ytrain)\n", + "gnb.score(xtest, ytest)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QnFAULkAV-sZ", + "outputId": "ac38acc2-cf96-485e-8fa6-2160337d8e04" + }, + "execution_count": 45, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.7896805896805896" + ] + }, + "metadata": {}, + "execution_count": 45 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Categorical " + ], + "metadata": { + "id": "yiPgoRtKWOFw" + } + }, + { + "cell_type": "code", + "source": [ + "Z = cat_data" + ], + "metadata": { + "id": "hlcb3OT_WOaT" + }, + "execution_count": 46, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder\n", + "# Label Encoder Can be used here\n", + "\n", + "od = OrdinalEncoder()\n", + "Z[Z.columns] = od.fit_transform(Z)" + ], + "metadata": { + "id": "OQZYtDBAWQen" + }, + "execution_count": 47, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "Z.sample(7)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 269 + }, + "id": "8FOCsjO2Wyua", + "outputId": "f0f4c1f3-483e-416a-81eb-9deeb6861d09" + }, + "execution_count": 48, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " workclass education education_num marital_status occupation \\\n", + "30801 3.0 7.0 11.0 2.0 11.0 \n", + "16043 3.0 11.0 8.0 0.0 11.0 \n", + "9015 3.0 9.0 12.0 2.0 11.0 \n", + "22087 3.0 15.0 9.0 2.0 3.0 \n", + "22054 3.0 15.0 9.0 2.0 0.0 \n", + "28671 3.0 15.0 9.0 2.0 2.0 \n", + "16541 3.0 12.0 13.0 2.0 9.0 \n", + "\n", + " relationship race sex native_country \n", + "30801 0.0 3.0 1.0 23.0 \n", + "16043 2.0 4.0 0.0 38.0 \n", + "9015 0.0 2.0 1.0 38.0 \n", + "22087 3.0 4.0 1.0 38.0 \n", + "22054 0.0 1.0 1.0 25.0 \n", + "28671 0.0 4.0 1.0 38.0 \n", + "16541 0.0 4.0 1.0 38.0 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
workclasseducationeducation_nummarital_statusoccupationrelationshipracesexnative_country
308013.07.011.02.011.00.03.01.023.0
160433.011.08.00.011.02.04.00.038.0
90153.09.012.02.011.00.02.01.038.0
220873.015.09.02.03.03.04.01.038.0
220543.015.09.02.00.00.01.01.025.0
286713.015.09.02.02.00.04.01.038.0
165413.012.013.02.09.00.04.01.038.0
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 48 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "xtrain, xtest, ytrain, ytest = train_test_split(Z, y, random_state=20)" + ], + "metadata": { + "id": "YVN65qCBW53H" + }, + "execution_count": 49, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.naive_bayes import MultinomialNB\n", + "\n", + "mnb = MultinomialNB()\n", + "mnb.fit(xtrain, ytrain)\n", + "print('Test', mnb.score(xtest, ytest))\n", + "print('Train', mnb.score(xtrain, ytrain))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "F11VxUB6W6ZK", + "outputId": "977f1538-f394-4825-e885-1b4afd334885" + }, + "execution_count": 50, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test 0.7932432432432432\n", + "Train 0.8002866502866502\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### MIX type features" + ], + "metadata": { + "id": "WddLwBfXXDhu" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install mixed-naive-bayes" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_r8yYxr0XmGz", + "outputId": "95c8b30d-61ee-40a4-b454-4924b701b99d" + }, + "execution_count": 51, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting mixed-naive-bayes\n", + " Downloading mixed_naive_bayes-0.0.1-py3-none-any.whl (8.9 kB)\n", + "Requirement already satisfied: numpy>=1.16.1 in /usr/local/lib/python3.7/dist-packages (from mixed-naive-bayes) (1.21.6)\n", + "Requirement already satisfied: scikit-learn>=0.20.2 in /usr/local/lib/python3.7/dist-packages (from mixed-naive-bayes) (1.0.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.20.2->mixed-naive-bayes) (3.1.0)\n", + "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.20.2->mixed-naive-bayes) (1.1.0)\n", + "Requirement already satisfied: scipy>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.20.2->mixed-naive-bayes) (1.4.1)\n", + "Installing collected packages: mixed-naive-bayes\n", + "Successfully installed mixed-naive-bayes-0.0.1\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "X = num_data\n", + "X[X.columns] = RobustScaler().fit_transform(X)\n", + "\n", + "Y = cat_data\n", + "od = OrdinalEncoder()\n", + "Y[Y.columns] = od.fit_transform(Y)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HsWURJ4nXrak", + "outputId": "b8f76962-8e19-466e-8d9d-b7ef907d3183" + }, + "execution_count": 52, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py:3678: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " self[col] = igetitem(value, i)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "X = X.join(Y)\n", + "X.sample(7)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 333 + }, + "id": "sW8oU_bMXxuX", + "outputId": "383fecf9-b1f0-41ba-e423-f78f3aed689d" + }, + "execution_count": 53, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " age fnlwgt capital_gain capital_loss hours_per_week workclass \\\n", + "4330 1.55 0.526836 0.0 0.0 -4.8 5.0 \n", + "28464 -0.25 1.116907 0.0 0.0 -0.4 3.0 \n", + "27872 0.25 1.179747 2885.0 0.0 1.0 3.0 \n", + "310 -0.15 -0.297476 7298.0 0.0 -1.0 3.0 \n", + "24317 0.05 -0.480008 0.0 0.0 0.0 3.0 \n", + "24178 -0.65 -0.434061 0.0 0.0 -4.0 1.0 \n", + "4327 -0.35 0.145836 0.0 0.0 -2.6 3.0 \n", + "\n", + " education education_num marital_status occupation relationship \\\n", + "4330 5.0 3.0 2.0 3.0 0.0 \n", + "28464 11.0 8.0 4.0 2.0 3.0 \n", + "27872 11.0 8.0 2.0 13.0 0.0 \n", + "310 12.0 13.0 2.0 3.0 0.0 \n", + "24317 11.0 8.0 2.0 9.0 0.0 \n", + "24178 9.0 12.0 4.0 9.0 3.0 \n", + "4327 11.0 8.0 4.0 5.0 3.0 \n", + "\n", + " race sex native_country \n", + "4330 4.0 1.0 38.0 \n", + "28464 2.0 1.0 38.0 \n", + "27872 4.0 1.0 38.0 \n", + "310 1.0 1.0 35.0 \n", + "24317 4.0 1.0 38.0 \n", + "24178 4.0 0.0 38.0 \n", + "4327 4.0 1.0 38.0 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agefnlwgtcapital_gaincapital_losshours_per_weekworkclasseducationeducation_nummarital_statusoccupationrelationshipracesexnative_country
43301.550.5268360.00.0-4.85.05.03.02.03.00.04.01.038.0
28464-0.251.1169070.00.0-0.43.011.08.04.02.03.02.01.038.0
278720.251.1797472885.00.01.03.011.08.02.013.00.04.01.038.0
310-0.15-0.2974767298.00.0-1.03.012.013.02.03.00.01.01.035.0
243170.05-0.4800080.00.00.03.011.08.02.09.00.04.01.038.0
24178-0.65-0.4340610.00.0-4.01.09.012.04.09.03.04.00.038.0
4327-0.350.1458360.00.0-2.63.011.08.04.05.03.04.01.038.0
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 53 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=20)" + ], + "metadata": { + "id": "KYMpJ-7GX0Fe" + }, + "execution_count": 54, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from mixed_naive_bayes import MixedNB\n", + "\n", + "model = MixedNB(categorical_features=np.arange(5, 14))\n", + "model.fit(xtrain, ytrain)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a3hpx7bdX2zG", + "outputId": "0907d88f-1337-4271-8233-4b9178f80e23" + }, + "execution_count": 55, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[ 8 16 16 7 14 6 5 2 41]\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "MixedNB(alpha=0.5, var_smoothing=1e-09)" + ] + }, + "metadata": {}, + "execution_count": 55 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.score(xtest, ytest)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "s-g1suFPYE2D", + "outputId": "57a308b6-3d10-4e21-9279-f7d5b2d323f0" + }, + "execution_count": 56, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.8242014742014742" + ] + }, + "metadata": {}, + "execution_count": 56 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model = MixedNB(alpha=1, categorical_features=np.arange(5, 14))\n", + "model.fit(xtrain, ytrain)\n", + "model.score(xtest, ytest)" + ], + "metadata": { + "id": "8vQ85-Ln09hN", + "outputId": "fa6c0d8e-8347-464f-bde6-d890ede2e1d1", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 62, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[ 8 16 16 7 14 6 5 2 41]\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.8242014742014742" + ] + }, + "metadata": {}, + "execution_count": 62 + } + ] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "tVoWYMgQ1NI9" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file