From 9634997a661474ff02e250e197356c346c37639a Mon Sep 17 00:00:00 2001
From: Diwakar Gupta <39624018+Diwakar-Gupta@users.noreply.github.com>
Date: Sun, 14 Aug 2022 14:38:59 +0530
Subject: [PATCH] Spam_Ham_Classifier
---
22-08-14-NLP/Spam_Ham_Classifier.ipynb | 728 +++++++++++++++++++++++++
1 file changed, 728 insertions(+)
create mode 100644 22-08-14-NLP/Spam_Ham_Classifier.ipynb
diff --git a/22-08-14-NLP/Spam_Ham_Classifier.ipynb b/22-08-14-NLP/Spam_Ham_Classifier.ipynb
new file mode 100644
index 0000000..1fb1b53
--- /dev/null
+++ b/22-08-14-NLP/Spam_Ham_Classifier.ipynb
@@ -0,0 +1,728 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "Spam_Ham_Classifier_April.ipynb",
+ "provenance": [],
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "871U7ZuiYLZw"
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "messages=pd.read_table('https://raw.githubusercontent.com/Pepcoders/Data-Science-January/main/NLP/sms',sep='\\t',header=None,names=['label','message'])  # tab-separated file read without a header row"
+ ],
+ "metadata": {
+ "id": "I7Msp7itYj2R"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "messages"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 424
+ },
+ "id": "pq4rXQk3Yk4q",
+ "outputId": "819647f5-e0d3-4dfa-e7dc-9dbe75588969"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " label message\n",
+ "0 ham Go until jurong point, crazy.. Available only ...\n",
+ "1 ham Ok lar... Joking wif u oni...\n",
+ "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n",
+ "3 ham U dun say so early hor... U c already then say...\n",
+ "4 ham Nah I don't think he goes to usf, he lives aro...\n",
+ "... ... ...\n",
+ "5567 spam This is the 2nd time we have tried 2 contact u...\n",
+ "5568 ham Will ü b going to esplanade fr home?\n",
+ "5569 ham Pity, * was in mood for that. So...any other s...\n",
+ "5570 ham The guy did some bitching but I acted like i'd...\n",
+ "5571 ham Rofl. Its true to its name\n",
+ "\n",
+ "[5572 rows x 2 columns]"
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " label | \n",
+ " message | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ham | \n",
+ " Go until jurong point, crazy.. Available only ... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ham | \n",
+ " Ok lar... Joking wif u oni... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " spam | \n",
+ " Free entry in 2 a wkly comp to win FA Cup fina... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ham | \n",
+ " U dun say so early hor... U c already then say... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ham | \n",
+ " Nah I don't think he goes to usf, he lives aro... | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 5567 | \n",
+ " spam | \n",
+ " This is the 2nd time we have tried 2 contact u... | \n",
+ "
\n",
+ " \n",
+ " 5568 | \n",
+ " ham | \n",
+ " Will ü b going to esplanade fr home? | \n",
+ "
\n",
+ " \n",
+ " 5569 | \n",
+ " ham | \n",
+ " Pity, * was in mood for that. So...any other s... | \n",
+ "
\n",
+ " \n",
+ " 5570 | \n",
+ " ham | \n",
+ " The guy did some bitching but I acted like i'd... | \n",
+ "
\n",
+ " \n",
+ " 5571 | \n",
+ " ham | \n",
+ " Rofl. Its true to its name | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5572 rows × 2 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 3
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#Data Cleaning and Processing\n",
+ "import nltk\n",
+ "import re\n",
+ "nltk.download('stopwords')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "0VMxeL37Yl3S",
+ "outputId": "7e136a17-47cf-489c-cd07-2d5ff4badc0a"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
+ "[nltk_data] Unzipping corpora/stopwords.zip.\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 4
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem.porter import PorterStemmer\n"
+ ],
+ "metadata": {
+ "id": "9GuPp659Yo7F"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "ps=PorterStemmer()\n",
+ "corpus=[]"
+ ],
+ "metadata": {
+ "id": "KqkCObDMYqMR"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Build the stop-word set once instead of calling stopwords.words() for every token.\n",
+ "stop_words = set(stopwords.words('english'))\n",
+ "corpus = []  # reset here so re-running this cell does not append every message twice\n",
+ "for message in messages['message']:\n",
+ "    # keep letters only, lower-case the text, then split it into tokens\n",
+ "    words = re.sub('[^a-zA-Z]', ' ', message).lower().split()\n",
+ "    # drop stop words, stem the remaining tokens and store the cleaned message\n",
+ "    review = ' '.join(ps.stem(word) for word in words if word not in stop_words)\n",
+ "    corpus.append(review)"
+ ],
+ "metadata": {
+ "id": "rxFW9nGaYt_K"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "## Bag-of-words features\n",
+ "## (every cleaned message becomes one row of term counts)\n",
+ "\n",
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "cv=CountVectorizer(max_features=3000).fit(corpus)  # vocabulary capped at 3000 terms\n",
+ "X=cv.transform(corpus).toarray()  # converted to a dense numpy array"
+ ],
+ "metadata": {
+ "id": "qBoy9Zz1YvlC"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "messages['label']"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "wJ7uRMHWY1QV",
+ "outputId": "e108b7e0-c192-4308-cd23-2a81efb50862"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0 ham\n",
+ "1 ham\n",
+ "2 spam\n",
+ "3 ham\n",
+ "4 ham\n",
+ " ... \n",
+ "5567 spam\n",
+ "5568 ham\n",
+ "5569 ham\n",
+ "5570 ham\n",
+ "5571 ham\n",
+ "Name: label, Length: 5572, dtype: object"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 9
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "\n",
+ "lb=LabelEncoder().fit(messages['label'])  # text labels -> integer codes (ham -> 0, spam -> 1)\n",
+ "y=lb.transform(messages['label'])"
+ ],
+ "metadata": {
+ "id": "O0zFC_BvY3GJ"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "y"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "y0yDIIdVY5I5",
+ "outputId": "68b723c1-151a-4abb-da03-fc28e9c6b9b8"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([0, 0, 1, ..., 0, 0, 0])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 11
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "X"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "IAIdHPTmY7sK",
+ "outputId": "d23d435a-742c-4d02-ef4b-fa05a5764df1"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([[0, 0, 0, ..., 0, 0, 0],\n",
+ " [0, 0, 0, ..., 0, 0, 0],\n",
+ " [0, 0, 0, ..., 0, 0, 0],\n",
+ " ...,\n",
+ " [0, 0, 0, ..., 0, 0, 0],\n",
+ " [0, 0, 0, ..., 0, 0, 0],\n",
+ " [0, 0, 0, ..., 0, 0, 0]])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 12
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "## Model Building: split the cleaned text first, then learn the vocabulary from the training part only\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "corpus_train,corpus_test,y_train,y_test=train_test_split(corpus,y,test_size=0.2,random_state=42)\n",
+ "cv=CountVectorizer(max_features=3000)  # refit: the vectorizer fitted on the full corpus had already seen the test messages\n",
+ "X_train,X_test=cv.fit_transform(corpus_train).toarray(),cv.transform(corpus_test).toarray()"
+ ],
+ "metadata": {
+ "id": "_tGj9L80Y9Da"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.naive_bayes import MultinomialNB,GaussianNB\n",
+ "\n",
+ "# Two Naive Bayes classifiers to compare on the same split:\n",
+ "# model_1 is the multinomial variant, model_2 the Gaussian variant.\n",
+ "spam_detect_model_1,spam_detect_model_2=MultinomialNB(),GaussianNB()\n"
+ ],
+ "metadata": {
+ "id": "f_57yerKY_VD"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "spam_detect_model_1.fit(X_train,y_train)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "AomcCaIAZSLJ",
+ "outputId": "a1b78ab3-087f-4928-fcf9-059cf6c616c4"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "MultinomialNB()"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 15
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "spam_detect_model_2.fit(X_train,y_train)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "tPrgljEDZTz_",
+ "outputId": "adec0210-377d-43ad-c533-8a9df765c95e"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "GaussianNB()"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 16
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "spam_detect_model_1.score(X_train,y_train)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "DunME4KgZU2C",
+ "outputId": "e8609db1-f633-48ae-f853-c125b5300802"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0.989230424052053"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 17
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "spam_detect_model_2.score(X_train,y_train)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HVcrLrKxZV52",
+ "outputId": "6f92f405-3ac0-4e31-f82d-9e90efd82db2"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0.8873681848777204"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 18
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "spam_detect_model_1.score(X_test,y_test)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Tl0Op1wwZYxL",
+ "outputId": "c2045515-01c2-42f0-f75b-928811f78626"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0.9838565022421525"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 19
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "spam_detect_model_2.score(X_test,y_test)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "FWWjotY3ZaLZ",
+ "outputId": "bda5673b-a874-4cfb-ce0d-f476205c787a"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0.8600896860986547"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 20
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "###############CONCLUSION##################\n",
+ "#### MULTINOMIAL NB OUTPERFORMS GAUSSIAN NB ON BOTH THE TRAIN AND TEST SPLITS ####"
+ ],
+ "metadata": {
+ "id": "mkmVgvEHZbYI"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "X_test"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "OEgI4b9fZkT9",
+ "outputId": "c4bcfec6-3b15-4979-80e9-2e5b800b4da1"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([[0, 0, 0, ..., 0, 0, 0],\n",
+ " [0, 0, 0, ..., 0, 0, 0],\n",
+ " [0, 0, 0, ..., 0, 0, 0],\n",
+ " ...,\n",
+ " [0, 0, 0, ..., 0, 0, 0],\n",
+ " [0, 0, 0, ..., 0, 0, 0],\n",
+ " [0, 0, 0, ..., 0, 0, 0]])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 22
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "set(spam_detect_model_1.predict(X_test))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "sfJQayeaZoH8",
+ "outputId": "d22300ad-a956-457d-9f99-a10c38740658"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "{0, 1}"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 24
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ ""
+ ],
+ "metadata": {
+ "id": "Ivfr0OhLZpaN"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file