From 9634997a661474ff02e250e197356c346c37639a Mon Sep 17 00:00:00 2001 From: Diwakar Gupta <39624018+Diwakar-Gupta@users.noreply.github.com> Date: Sun, 14 Aug 2022 14:38:59 +0530 Subject: [PATCH] Spam_Ham_Classifier --- 22-08-14-NLP/Spam_Ham_Classifier.ipynb | 728 +++++++++++++++++++++++++ 1 file changed, 728 insertions(+) create mode 100644 22-08-14-NLP/Spam_Ham_Classifier.ipynb diff --git a/22-08-14-NLP/Spam_Ham_Classifier.ipynb b/22-08-14-NLP/Spam_Ham_Classifier.ipynb new file mode 100644 index 0000000..1fb1b53 --- /dev/null +++ b/22-08-14-NLP/Spam_Ham_Classifier.ipynb @@ -0,0 +1,728 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Spam_Ham_Classifier_April.ipynb", + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "871U7ZuiYLZw" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "source": [ + "messages=pd.read_csv('https://raw.githubusercontent.com/Pepcoders/Data-Science-January/main/NLP/sms',sep='\\t',names=['label','message'])" + ], + "metadata": { + "id": "I7Msp7itYj2R" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "messages" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "pq4rXQk3Yk4q", + "outputId": "819647f5-e0d3-4dfa-e7dc-9dbe75588969" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " label message\n", + "0 ham Go until jurong point, crazy.. Available only ...\n", + "1 ham Ok lar... Joking wif u oni...\n", + "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n", + "3 ham U dun say so early hor... U c already then say...\n", + "4 ham Nah I don't think he goes to usf, he lives aro...\n", + "... ... ...\n", + "5567 spam This is the 2nd time we have tried 2 contact u...\n", + "5568 ham Will ü b going to esplanade fr home?\n", + "5569 ham Pity, * was in mood for that. So...any other s...\n", + "5570 ham The guy did some bitching but I acted like i'd...\n", + "5571 ham Rofl. Its true to its name\n", + "\n", + "[5572 rows x 2 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
labelmessage
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
.........
5567spamThis is the 2nd time we have tried 2 contact u...
5568hamWill ü b going to esplanade fr home?
5569hamPity, * was in mood for that. So...any other s...
5570hamThe guy did some bitching but I acted like i'd...
5571hamRofl. Its true to its name
\n", + "

5572 rows × 2 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 3 + } + ] + }, + { + "cell_type": "code", + "source": [ + "#Data Cleaning and Processing\n", + "import nltk\n", + "import re\n", + "nltk.download('stopwords')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0VMxeL37Yl3S", + "outputId": "7e136a17-47cf-489c-cd07-2d5ff4badc0a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from nltk.corpus import stopwords\n", + "from nltk.stem.porter import PorterStemmer\n" + ], + "metadata": { + "id": "9GuPp659Yo7F" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "ps=PorterStemmer()\n", + "corpus=[]" + ], + "metadata": { + "id": "KqkCObDMYqMR" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "for i in range(0,len(messages)):\n", + " review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])\n", + " review=review.lower()\n", + " review=review.split()\n", + "\n", + " review=[ps.stem(word) for word in review if not word in stopwords.words('english')]\n", + " review=\" \".join(review)\n", + " corpus.append(review)\n", + "\n" + ], + "metadata": { + "id": "rxFW9nGaYt_K" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "## Create Bag Of Words Model\n", + "## Convert words to numeric form\n", + "\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "cv=CountVectorizer(max_features=3000)\n", + "X=cv.fit_transform(corpus).toarray()" + ], + "metadata": { + "id": "qBoy9Zz1YvlC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "messages['label']" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wJ7uRMHWY1QV", + "outputId": "e108b7e0-c192-4308-cd23-2a81efb50862" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 ham\n", + "1 ham\n", + "2 spam\n", + "3 ham\n", + "4 ham\n", + " ... \n", + "5567 spam\n", + "5568 ham\n", + "5569 ham\n", + "5570 ham\n", + "5571 ham\n", + "Name: label, Length: 5572, dtype: object" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "lb=LabelEncoder()\n", + "y=lb.fit_transform(messages['label'])" + ], + "metadata": { + "id": "O0zFC_BvY3GJ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "y" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "y0yDIIdVY5I5", + "outputId": "68b723c1-151a-4abb-da03-fc28e9c6b9b8" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([0, 0, 1, ..., 0, 0, 0])" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "source": [ + "X" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IAIdHPTmY7sK", + "outputId": "d23d435a-742c-4d02-ef4b-fa05a5764df1" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([[0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " ...,\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0]])" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { + "cell_type": "code", + "source": [ + "## Model Building\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)" + ], + "metadata": { + "id": "_tGj9L80Y9Da" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.naive_bayes import MultinomialNB,GaussianNB\n", + "\n", + "spam_detect_model_1=MultinomialNB()\n", + "\n", + "spam_detect_model_2=GaussianNB()\n" + ], + "metadata": { + "id": "f_57yerKY_VD" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "spam_detect_model_1.fit(X_train,y_train)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AomcCaIAZSLJ", + "outputId": "a1b78ab3-087f-4928-fcf9-059cf6c616c4" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "MultinomialNB()" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ] + }, + { + "cell_type": "code", + "source": [ + "spam_detect_model_2.fit(X_train,y_train)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tPrgljEDZTz_", + "outputId": "adec0210-377d-43ad-c533-8a9df765c95e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "GaussianNB()" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "code", + "source": [ + "spam_detect_model_1.score(X_train,y_train)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DunME4KgZU2C", + "outputId": "e8609db1-f633-48ae-f853-c125b5300802" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.989230424052053" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ] + }, + { + "cell_type": "code", + "source": [ + "spam_detect_model_2.score(X_train,y_train)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HVcrLrKxZV52", + "outputId": "6f92f405-3ac0-4e31-f82d-9e90efd82db2" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.8873681848777204" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ] + }, + { + "cell_type": "code", + "source": [ + "spam_detect_model_1.score(X_test,y_test)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Tl0Op1wwZYxL", + "outputId": "c2045515-01c2-42f0-f75b-928811f78626" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.9838565022421525" + ] + }, + "metadata": {}, + "execution_count": 19 + } + ] + }, + { + "cell_type": "code", + "source": [ + "spam_detect_model_2.score(X_test,y_test)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FWWjotY3ZaLZ", + "outputId": "bda5673b-a874-4cfb-ce0d-f476205c787a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.8600896860986547" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "source": [ + "###############CONCLUSION##################\n", + "#### MULTINOMIAL NB IS PERFORMING WELL ####" + ], + "metadata": { + "id": "mkmVgvEHZbYI" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "X_test" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OEgI4b9fZkT9", + "outputId": "c4bcfec6-3b15-4979-80e9-2e5b800b4da1" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([[0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " ...,\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0]])" + ] + }, + "metadata": {}, + "execution_count": 22 + } + ] + }, + { + "cell_type": "code", + "source": [ + "set(spam_detect_model_1.predict(X_test))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sfJQayeaZoH8", + "outputId": "d22300ad-a956-457d-9f99-a10c38740658" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{0, 1}" + ] + }, + "metadata": {}, + "execution_count": 24 + } + ] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "Ivfr0OhLZpaN" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file