-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclean_data_nn.py
98 lines (72 loc) · 3.19 KB
/
clean_data_nn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# coding: utf-8
import glob
import html
import re
import string
from collections import Counter
from itertools import chain

import matplotlib
import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import model_selection
# Read the five annotated tweet batches and stack them into a single frame.
# NOTE(review): relative paths — assumes the script runs from the repo root.
_BATCH_FILES = [
    "Coded Tweets (Aggregated)/Tweets Batch 1 (N=1000)_ALLRATERS.csv",
    "Coded Tweets (Aggregated)/Tweets Batch 2 (N=2000)_ALLRATERS.csv",
    "Coded Tweets (Aggregated)/Tweets Batch 3 (N=2000)_ALLRATERS.csv",
    "Coded Tweets (Aggregated)/Tweets Batch 4 (N=1000)_ALLRATERS.csv",
    "Coded Tweets (Aggregated)/Tweets Batch 5 (N=1000)_ALLRATERS.csv",
]
# Repair the misspelled rater columns so every batch shares one schema.
_COLUMN_FIXES = {'hatespeech7namecalaling1': 'hatespeech7',
                 'nanmecalling2': 'namecalling2'}
# Was five copy-pasted read/rename statements (b1..b5); one loop instead.
frame = pd.concat(
    [pd.read_csv(path).rename(columns=_COLUMN_FIXES) for path in _BATCH_FILES],
    axis=0,
)
def clean_tweet(tweet):
    """Normalize one raw tweet string for model training.

    Steps: strip whitespace, drop retweet markers and @-mentions, decode
    HTML entities, remove hyperlinks, lowercase, turn '#' into a space
    (keeping the hashtag word), then tokenize and re-join so punctuation
    is space-separated.

    :param tweet: raw tweet text (caller converts non-strings with str()).
    :return: cleaned, lowercased, token-spaced tweet text.
    """
    tweet = tweet.strip()
    # Remove retweet prefix and @-mentions.
    tweet = re.sub(r'RT @[A-Za-z0-9:_]+', '', tweet)
    tweet = re.sub(r'@[A-Za-z0-9]+', '', tweet)
    # Decode HTML entities (&amp;, &lt;, &gt;, ...). The previous three
    # re.sub calls substituted a character for itself — no-ops; the
    # "Fix html punctuation" comment shows entity decoding was intended.
    tweet = html.unescape(tweet)
    # Remove hyperlinks, then lowercase.
    tweet = re.sub(r"http\S+", "", tweet).lower()
    # Replace the hashtag marker with a space so the tag word survives.
    tweet = re.sub("#", " ", tweet)
    # Tokenize, then re-join with single spaces to separate punctuation.
    tweet = ' '.join(word_tokenize(tweet))
    # Strip again: the transforms above can leave edge whitespace.
    return tweet.strip()
# Clean every tweet; str() guards against NaN cells pandas reads as float.
# (Comprehension replaces the manual tolist/append loop.)
frame['clean_tweet'] = [clean_tweet(str(raw)) for raw in frame["Tweet"]]
# Collapse the per-rater binary codes into one label per category:
# a tweet is positive when at least 2 of the raters marked it yes.
# NOTE(review): range(1, 8) assumes exactly 7 rater columns per label — confirm.
labels = ['CAPS', 'Obscenity', 'Threat', 'hatespeech', 'namecalling',
          'negprejudice', 'noneng', 'porn', 'stereotypes']
for label in labels:
    rater_cols = [label + str(i) for i in range(1, 8)]  # e.g. CAPS1..CAPS7
    frame[label + '_num_yes'] = frame[rater_cols].sum(axis=1)
    # The comparison already yields a Series; the pd.Series(...) wrapper was redundant.
    frame[label] = (frame[label + '_num_yes'] >= 2).astype(int)
# Persist the full cleaned dataset, then split 80/20 with a fixed seed.
frame.to_csv('clean_data_nn.csv')
train, test = model_selection.train_test_split(frame, test_size=0.2, random_state=123)


def _finalize_split(split):
    """Drop empty/missing clean tweets and assign fresh 0..n-1 IDs.

    Row-dropping runs AFTER the split so that changes to clean_tweet cannot
    reshuffle which rows land in train vs. test.
    """
    # .copy() avoids chained assignment on a filtered view
    # (SettingWithCopyWarning; the 'ID' write could otherwise be lost).
    split = split[split['clean_tweet'].notna() & (split['clean_tweet'] != "")].copy()
    split['ID'] = range(0, split.shape[0])
    split.reset_index(inplace=True, drop=True)
    return split


train = _finalize_split(train)
test = _finalize_split(test)
train.to_csv("train_nn.csv")
test.to_csv("test_nn.csv")