forked from JulianHill/R-Tutorials
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentiment_datumbox.r
123 lines (101 loc) · 3.16 KB
/
sentiment_datumbox.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# load packages
library(twitteR)
library(RCurl)
library(RJSONIO)
library(stringr)
getSentiment <- function (text, key){
text <- URLencode(text);
#save all the spaces, then get rid of the weird characters that break the API, then convert back the URL-encoded spaces.
text <- str_replace_all(text, "%20", " ");
text <- str_replace_all(text, "%\\d\\d", "");
text <- str_replace_all(text, " ", "%20");
if (str_length(text) > 360){
text <- substr(text, 0, 359);
}
##########################################
data <- getURL(paste("http://api.datumbox.com/1.0/TwitterSentimentAnalysis.json?api_key=", key, "&text=",text, sep=""))
js <- fromJSON(data, asText=TRUE);
# get mood probability
sentiment = js$output$result
###################################
data <- getURL(paste("http://api.datumbox.com/1.0/SubjectivityAnalysis.json?api_key=", key, "&text=",text, sep=""))
js <- fromJSON(data, asText=TRUE);
# get mood probability
subject = js$output$result
##################################
data <- getURL(paste("http://api.datumbox.com/1.0/TopicClassification.json?api_key=", key, "&text=",text, sep=""))
js <- fromJSON(data, asText=TRUE);
# get mood probability
topic = js$output$result
##################################
data <- getURL(paste("http://api.datumbox.com/1.0/GenderDetection.json?api_key=", key, "&text=",text, sep=""))
js <- fromJSON(data, asText=TRUE);
# get mood probability
gender = js$output$result
return(list(sentiment=sentiment,subject=subject,topic=topic,gender=gender))
}
#################
clean.text <- function(some_txt)
{
some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
some_txt = gsub("@\\w+", "", some_txt)
some_txt = gsub("[[:punct:]]", "", some_txt)
some_txt = gsub("[[:digit:]]", "", some_txt)
some_txt = gsub("http\\w+", "", some_txt)
some_txt = gsub("[ \t]{2,}", "", some_txt)
some_txt = gsub("^\\s+|\\s+$", "", some_txt)
# define "tolower error handling" function
try.tolower = function(x)
{
y = NA
try_error = tryCatch(tolower(x), error=function(e) e)
if (!inherits(try_error, "error"))
y = tolower(x)
return(y)
}
some_txt = sapply(some_txt, try.tolower)
some_txt = some_txt[some_txt != ""]
names(some_txt) = NULL
return(some_txt)
}
# harvest tweets
tweets = searchTwitter("iPhone", n=200, lang="en")
# get text
tweet_txt = sapply(tweets, function(x) x$getText())
# clean text
tweet_clean = clean.text(tweet_txt)
#####################################
# how many tweets
tweet_num = length(tweet_clean)
# data frame (text, sentiment, score)
tweet_df = data.frame(text=tweet_clean, sentiment=rep("", tweet_num),
subject=1:tweet_num, topic=1:tweet_num, gender=1:tweet_num, stringsAsFactors=FALSE)
# apply function getSentiment
sentiment = rep(0, tweet_num)
for (i in 1:tweet_num)
{
tmp = getSentiment(tweet_clean[i], "API_KEY")
tweet_df$sentiment[i] = tmp$sentiment
tweet_df$subject[i] = tmp$subject
tweet_df$topic[i] = tmp$topic
tweet_df$gender[i] = tmp$gender
}