-
Notifications
You must be signed in to change notification settings - Fork 0
/
testing.txt
147 lines (111 loc) · 5.54 KB
/
testing.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# %% [code] {"_execution_state":"idle"}
## Importing packages
# This R environment comes with all of CRAN and many other helpful packages preinstalled.
# You can see which packages are installed by checking out the kaggle/rstats docker image:
# https://github.com/kaggle/docker-rstats
library(tidyverse) # metapackage with lots of helpful functions
## Running code
# In a notebook, you can run a single code cell by clicking in the cell and then hitting
# the blue arrow to the left, or by clicking in the cell and pressing Shift+Enter. In a script,
# you can run code by highlighting the code you want to run and then clicking the blue arrow
# at the bottom of this window.
## Reading in files
# You can access files from datasets you've added to this kernel in the "../input/" directory.
# You can see the files added to this kernel by running the code below.
#list.files(path = "../input")
## Saving data
# If you save any files or images, these will be put in the "output" directory. You
# can see the output directory by committing and running your kernel (using the
# Commit & Run button) and then checking out the compiled version of your kernel.
#getwd()
# Import the full tweet data set (it is split into train/test further down).
# stringsAsFactors = FALSE keeps the tweet text as character on every R
# version; with factors, concatenating text columns later would produce
# integer level codes instead of the tweets themselves.
data <- read.csv(
  "../input/twitter-airline-sentiment/Tweets.csv",
  stringsAsFactors = FALSE
)
prop.table(table(data$airline_sentiment)) # share of each sentiment label
head(data) # display the first 6 observations
print(summary(data))
# Install NLP only when it is missing instead of re-installing on every run.
if (!requireNamespace("NLP", quietly = TRUE)) {
  install.packages("NLP")
}
library(tm)

# Keep only the non-neutral tweets and the two columns used below.
tweet_data <- subset(data, airline_sentiment != "neutral")
tweet_data <- subset(tweet_data, select = c("airline_sentiment", "text"))
head(tweet_data)

# 70/30 train/test split; the fixed seed makes the split reproducible.
set.seed(123)
dt <- sort(sample(nrow(tweet_data), floor(nrow(tweet_data) * 0.7)))
train <- tweet_data[dt, ]
test <- tweet_data[-dt, ]
# factor() (unlike as.factor()) also drops an unused "neutral" level if the
# column was already a factor.
train$airline_sentiment <- factor(train$airline_sentiment)

# Build one corpus from train followed by test so both share the same
# vocabulary. as.character() guards against factor text columns, for which
# c() would return integer codes.
corpus <- VCorpus(
  VectorSource(c(as.character(train$text), as.character(test$text)))
)
corpus <- tm_map(corpus, content_transformer(tolower)) # lower-case the text
# NOTE(review): this lazy PlainTextDocument mapping is kept as-is; it looks
# redundant after content_transformer() -- confirm before removing.
corpus <- tm_map(corpus, PlainTextDocument, lazy = TRUE)
corpus <- tm_map(corpus, removePunctuation) # remove punctuation
corpus <- tm_map(corpus, removeWords, stopwords(kind = "english")) # stop words
corpus <- tm_map(corpus, stripWhitespace) # collapse repeated whitespace
corpus <- tm_map(corpus, stemDocument) # reduce words to their stems

# To obtain input features for the classifiers, turn the corpus into a
# document-term matrix: one row per document, one column per distinct term,
# and each cell holding the frequency of that term in that document.
dtm <- DocumentTermMatrix(corpus)
# Drop terms that are absent from more than 97% of the documents, i.e. keep
# terms that occur in roughly 3% of the documents or more.
sparse <- removeSparseTerms(dtm, 0.97)
important_words_df <- as.data.frame(as.matrix(sparse))
# Make the term names syntactically valid column names (needed for formulas).
colnames(important_words_df) <- make.names(colnames(important_words_df))

# The corpus was built as train followed by test, so the first nrow(train)
# rows belong to train and the last nrow(test) rows belong to test.
important_words_train_df <- head(important_words_df, nrow(train))
important_words_test_df <- tail(important_words_df, nrow(test))

# Add to original dataframes
train_data_words_df <- cbind(train, important_words_train_df)
test_data_words_df <- cbind(test, important_words_test_df)

# Get rid of the original text field; only the label and term counts remain.
train_data_words_df$text <- NULL
test_data_words_df$text <- NULL
# Fit a logistic regression of the sentiment label on all remaining columns
# of the training frame.
log_model <- glm(
  formula = airline_sentiment ~ .,
  family = binomial,
  data = train_data_words_df
)
summary(log_model)

# Score the test frame; type = "response" returns fitted probabilities.
log_pred <- predict(
  log_model,
  newdata = test_data_words_df,
  type = "response"
)
log_pred[seq_len(5)]

# Cross-tabulate the actual class against predictions thresholded at 0.5.
table(test_data_words_df$airline_sentiment, log_pred > 0.5)
# train naive bayes model
# Install the packages only when they are missing instead of on every run.
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
if (!requireNamespace("usethis", quietly = TRUE)) {
  install.packages("usethis")
}
library(e1071)
library(devtools)
library(usethis)

# Train on the term-count features rather than on `train`, whose only
# predictor is the raw tweet text.
naive_model <- naiveBayes(airline_sentiment ~ ., data = train_data_words_df)
# Predict from the feature columns only. drop = FALSE keeps a data frame even
# if a single column is left; a bare vector loses the column names that
# predict() needs to line the features up with the model.
naive_predict <- predict(
  naive_model,
  test_data_words_df[
    , names(test_data_words_df) != "airline_sentiment",
    drop = FALSE
  ]
)
table(naive_predict, true = test_data_words_df$airline_sentiment)

# Second run: same model specification, scored on the full test frame with
# the label column left in. Each object now refers to its own model and
# predictions instead of reusing naive_model / naive_predict from above.
naive_model1 <- naiveBayes(airline_sentiment ~ ., data = train_data_words_df)
naive_predict1 <- predict(naive_model1, test_data_words_df)
table(naive_predict1, true = test$airline_sentiment)
# kmeans to determine the proximity of words
# Install fpc only when it is missing instead of re-installing on every run.
if (!requireNamespace("fpc", quietly = TRUE)) {
  install.packages("fpc")
}
library(fpc)
# Split the tweets by sentiment so each group can be clustered separately.
positive <- subset(tweet_data, airline_sentiment == "positive")
negative <- subset(tweet_data, airline_sentiment == "negative")
#' Build a term-count data frame from raw text
#'
#' Runs the text through the tm cleaning pipeline (lower-casing, punctuation
#' and English stop-word removal, whitespace stripping, stemming), builds a
#' document-term matrix and drops sparse terms.
#'
#' @param text_to_analyse Vector with one element per document. It is coerced
#'   with `as.character()` so factor input is read as text, not level codes.
#' @param sparsity Threshold passed to `removeSparseTerms()`: terms absent
#'   from more than this share of documents are dropped. Defaults to 0.97.
#' @return A data frame with one row per document and one column per retained
#'   term (names made syntactically valid with `make.names()`); each cell is
#'   the frequency of the term in the document.
splitdata <- function(text_to_analyse, sparsity = 0.97) {
  corpus <- VCorpus(VectorSource(as.character(text_to_analyse)))
  corpus <- tm_map(corpus, content_transformer(tolower)) # lower-case the text
  # NOTE(review): lazy PlainTextDocument mapping kept as-is; it looks
  # redundant after content_transformer() -- confirm before removing.
  corpus <- tm_map(corpus, PlainTextDocument, lazy = TRUE)
  corpus <- tm_map(corpus, removePunctuation) # remove punctuation
  corpus <- tm_map(corpus, removeWords, stopwords(kind = "english"))
  corpus <- tm_map(corpus, stripWhitespace) # collapse repeated whitespace
  corpus <- tm_map(corpus, stemDocument) # reduce words to their stems

  dtm <- DocumentTermMatrix(corpus)
  sparse <- removeSparseTerms(dtm, sparsity)
  important_words_df <- as.data.frame(as.matrix(sparse))
  colnames(important_words_df) <- make.names(colnames(important_words_df))
  important_words_df
}
# Term counts for the negative tweets.
neg_word <- splitdata(negative$text)
# Euclidean distance between terms: t() puts the terms on the rows, so dist()
# compares terms rather than documents.
distance_neg <- dist(t(as.matrix(neg_word)), method = "euclidean")
# kmeans() picks random starting centres; fix the seed so the clusters are
# reproducible between runs.
set.seed(123)
kmodel <- kmeans(distance_neg, 3)
kmodel
# plot the cluster
library(cluster)
clusplot(
  as.matrix(distance_neg), kmodel$cluster,
  color = TRUE, shade = TRUE, labels = 2, lines = 2, cex = 0.4
)

# Same steps for the positive tweets.
pos_word <- splitdata(positive$text)
distance_pos <- dist(t(as.matrix(pos_word)), method = "euclidean")
set.seed(123)
kmodel_pos <- kmeans(distance_pos, 3)
kmodel_pos
# plot the cluster
clusplot(
  as.matrix(distance_pos), kmodel_pos$cluster,
  color = TRUE, shade = TRUE, labels = 2, lines = 2, cex = 0.4
)