# utils.py
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
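
# NOTE (added): the NLTK English stopword list used below ships separately
# from the nltk package itself. A guarded one-time download keeps
# get_cleaned_data() runnable on a fresh environment; it is a no-op once the
# corpus is already installed.
import nltk

try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")
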
# Data Preprocessing
def get_cleaned_data():
    df = pd.read_csv('spam.csv', delimiter=',')
    df.drop(df.columns.difference(['v1', 'v2']), axis=1, inplace=True)
    nRow, nCol = df.shape
    df = df.rename({
        "v1": "Category",
        "v2": "Message"
    }, axis=1)
    df["Label"] = df["Category"].map({
        "ham": 0,
        "spam": 1,
    })
    # drop null values and deduplicate the ham messages
    df = df.dropna()
    df1 = df.loc[df['Label'] == 0].drop_duplicates()
    df = pd.concat([df1, df.loc[df['Label'] == 1]])

    # Append the 2nd dataset
    df2 = pd.read_csv('spamExtra.csv', delimiter=',')
    df2.drop(df2.columns.difference(['v1', 'v2']), axis=1, inplace=True)
    nRow, nCol = df2.shape
    df2 = df2.rename({
        "v1": "Category",
        "v2": "Message"
    }, axis=1)
    df2["Label"] = df2["Category"].map({
        "ham": 0,
        "spam": 1,
    })
    # drop null values
    df2 = df2.dropna()
    # keep only the spam rows (Label == 1) from the 2nd dataset
    df2_class_1 = df2.loc[df2['Label'] == 1]
    df = pd.concat([df, df2_class_1])

    print(f'There are {df.shape[0]} rows and {df.shape[1]} columns')
    # print the class counts
    print(df["Category"].value_counts())
    # plot the class distribution; it is clearly imbalanced
    sns.countplot(data=df, x="Category")
    plt.title("ham vs spam")
    plt.show()
    # text preprocessing
    ps = PorterStemmer()
    # build the stopword set once instead of rebuilding it for every message
    stop_words = set(stopwords.words("english"))

    def clean_data():
        corpus = []
        for msg in df["Message"]:
            # replace everything that is not a letter (newlines included) with a space
            msg = re.sub("[^a-zA-Z]", " ", msg)
            # convert to lowercase
            msg = msg.lower()
            # split the message into a list of words
            msg = msg.split()
            # stem every word that is not an English stopword
            msg = [ps.stem(word)
                   for word in msg
                   if word not in stop_words
                   ]
            # join the word list back together with spaces
            msg = " ".join(msg)
            corpus.append(msg)
        return corpus
    data = {'Message': clean_data(), 'Label': df.Label}
    df_new = pd.DataFrame(data=data)
    # new df head
    print(df_new.head())

    # split our data into training & testing sets
    X_train, X_test, y_train, y_test = train_test_split(df_new.Message, df_new.Label, test_size=0.2, random_state=0)
    print(f'X_train: {X_train.shape}')

    # tokenize our data
    vocab_size = 10000  # only consider this many words
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(X_train)
    X_train = np.array(tokenizer.texts_to_sequences(X_train), dtype=object)
    X_test = np.array(tokenizer.texts_to_sequences(X_test), dtype=object)
    # pad the sequences so every sample has the same length
    sentence_len = 200
    X_train = pad_sequences(X_train, maxlen=sentence_len)
    X_test = pad_sequences(X_test, maxlen=sentence_len)
    # Create a SMOTE object
    smote = SMOTE(random_state=42)
    # Oversample the minority class so the training set is balanced
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Count the number of spam and ham samples after resampling
    class_counts = Counter(y_train)
    # Sort the classes so the bars line up with the tick labels below
    classes = sorted(class_counts)
    # Create a bar plot of the class counts
    ax = sns.barplot(x=classes, y=[class_counts[c] for c in classes], color="blue")
    # Set the x-axis labels
    ax.set_xticklabels(["Ham", "Spam"])
    # Set the y-axis label
    ax.set_ylabel("Count")
    # Set the plot title
    ax.set_title("Spam vs. Ham Counts")
    # Show the plot
    plt.show()

    return X_train, X_test, y_train, y_test, vocab_size, sentence_len
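

# Added sketch, not part of the original module: a minimal baseline classifier
# built from the vocab_size and sentence_len returned by get_cleaned_data().
# The embedding + LSTM architecture and its layer sizes are assumptions for
# demonstration, not necessarily the model this repository trained.
def build_baseline_model(vocab_size, sentence_len):
    from keras.models import Sequential
    from keras.layers import Embedding, LSTM, Dense
    model = Sequential([
        Embedding(vocab_size, 32, input_length=sentence_len),  # learned word vectors
        LSTM(64),                                              # sequence encoder
        Dense(1, activation="sigmoid"),                        # spam probability
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model
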
# Helper Functions
def generate_model_output(y_test, y_pred):
    ax = plt.subplot()
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, ax=ax, cmap='Blues', fmt='')
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted labels')
    ax.xaxis.set_ticklabels(['Not Spam', 'Spam'])
    ax.set_ylabel('True labels')
    ax.yaxis.set_ticklabels(['Not Spam', 'Spam'])
    print("Classification Report\n", classification_report(y_test, y_pred), "\n")
    print("Accuracy Score:", accuracy_score(y_test, y_pred))