-
Notifications
You must be signed in to change notification settings - Fork 0
/
DataAnalysis.R
198 lines (147 loc) · 6.15 KB
/
DataAnalysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
library("rjson")
# The next line is for exporting to HTML, because PDF export fails
# (it does not render any plots).
# dev.off()

# Load the original statistics data set from UCI.
MyData <- read.csv("./OnlineNewsPopularity.csv", header = TRUE, sep = ",")

# Reading the refined CSV of news articles (not working):
# newsArticles = read.csv(file="./newsData.csv",header=TRUE,sep=",")
# Reading the news text data in JSON format:
# json = fromJSON(file = "/Users/apurvatripathi/Desktop/Thesis Cloud/OnlineNewsData/webScrapper/mashableScrapper/convertedJSON.json")
# Converting the JSON to a data frame:
# textData = do.call(cbind, json)

# Per-column summary of the whole data set.
summary(MyData)

# Summary of the share counts only.
summary(MyData$shares)

# First five rows of the data.
MyData[1:5, ]

# Standard deviation of the share counts (missing values are not removed).
sd(MyData$shares)

# Raise the penalty for scientific notation when numbers are printed.
options(scipen = 5)
# Scatter plot with shares on the x-axis and the number of images on the y-axis.
with(MyData, plot(shares, num_imgs, xlab = "Shares", ylab = "Images"))
title(main = "Simple plot of shares vs images")

# Histogram of the raw share counts; the title is supplied through `main`,
# so no separate title() call is needed.
hist(
  MyData$shares,
  breaks = 200,
  main = "Simple Histogram of Shares",
  xlab = "Shares"
)
# Log-transform the share counts and inspect the transformed values.
shares_log <- log(MyData$shares)
head(shares_log)
shares_log[1:10]
median(shares_log)
max(shares_log)

# Frequency histogram of log(shares) with a fitted normal curve on top.
# hist() returns the bin breaks, which are needed to scale the curve below.
shares_log_hist <- hist(
  shares_log,
  breaks = 200,
  xlab = "log(Shares)",
  ylab = "Frequency",
  main = "Simple Histogram of log(Shares)",
  col = "lightgreen"
)

# dnorm() is on the density scale while this histogram shows counts, so the
# curve is multiplied by (number of observations * bin width); without the
# scaling it would lie flat against the x-axis.
shares_log_bin_width <- diff(shares_log_hist$breaks)[1]
curve(
  dnorm(x, mean = mean(shares_log), sd = sd(shares_log)) *
    length(shares_log) * shares_log_bin_width,
  add = TRUE,
  col = "red",
  lwd = 2
)
# Use ggplot2 for better plots. library() stops with an error when the package
# is missing, whereas require() only returns FALSE and lets the script fail
# later at the first ggplot call.
library(ggplot2)

# Histogram of log(shares). Built with ggplot() directly because qplot() is
# deprecated; 30 bins matches the default qplot() used for a single variable.
ggplot(MyData, aes(x = log(shares))) +
  geom_histogram(bins = 30) +
  ggtitle("Shares Frequency") +
  ylab("Frequency") +
  xlab("log(Shares)")

# Density estimate of log(shares). The title is attached with ggtitle():
# base title() draws on the base-graphics device and does not title a ggplot.
ggplot(MyData, aes(x = log(shares))) +
  stat_density() +
  ggtitle("Shares Frequency") +
  ylab("Density") +
  xlab("log(Shares)")

# Base-graphics density histogram (freq = FALSE), so the normal density curve
# can be overlaid on the same scale without rescaling.
hist(
  shares_log,
  breaks = 200,
  freq = FALSE,
  xlab = "log(Shares)",
  ylab = "Density",
  main = "Density Histogram of log(Shares)",
  col = "lightgreen"
)
curve(
  dnorm(x, mean = mean(shares_log), sd = sd(shares_log)),
  add = TRUE,
  col = "red",
  lwd = 2
)
# Scatter plots of log(shares) against positive polarity, negative polarity
# and the number of images.
# ggplot() is used instead of the deprecated qplot(): qplot() added its own
# opaque point layer underneath the translucent blue one, which hid the
# alpha blending meant to reveal dense regions. Columns are referenced by
# name through `data` rather than with `MyData$` inside the call.

# Positive polarity
ggplot(MyData, aes(x = log(shares), y = avg_positive_polarity)) +
  geom_point(color = "blue", alpha = 0.1) +
  ggtitle("Shares vs Positive Polarity") +
  xlab("log(Shares)") +
  ylab("Positive Polarity")

# Negative polarity
ggplot(MyData, aes(x = log(shares), y = avg_negative_polarity)) +
  geom_point(color = "blue", alpha = 0.1) +
  ggtitle("Shares vs Neg Polarity") +
  xlab("log(Shares)") +
  ylab("Negative Polarity")

# Number of images
ggplot(MyData, aes(x = log(shares), y = num_imgs)) +
  geom_point(color = "blue", alpha = 0.1) +
  ggtitle("Shares vs Images") +
  xlab("log(Shares)") +
  ylab("Number of Images")
# Box plot of log(shares) with base graphics. The default boxplot() method has
# no `data` argument (it is only meaningful with a formula), so none is passed.
boxplot(shares_log, main = "Shares Data Box Plot")

# The same box plot with ggplot2, with outliers highlighted in red.
# The y-axis label is added with ylab(): ggplot() itself ignores a `ylab`
# argument. The plot object is not called `plot`, to avoid shadowing the
# base graphics function of that name.
shares_boxplot <- ggplot(MyData, aes(y = log(shares))) +
  geom_boxplot(
    color = "blue",
    fill = "blue",
    alpha = 0.2,
    # custom outliers
    outlier.colour = "red",
    outlier.fill = "red",
    outlier.size = 3
  ) +
  ylab("log(Shares)")
shares_boxplot + ggtitle("Box Plot of log(Shares)")
# Bar plot with one bar per article; bar height is log(shares).
barplot(shares_log, main = "log(Shares) Data")

# Frequency bar plot: how many articles share each distinct log(shares) value.
freqTable <- table(shares_log)
barplot(
  freqTable,
  main = "log(Shares) Frequency Data",
  ylab = "Frequency",
  xlab = "log(Shares)"
)

# Bar plot of log(shares) against the number of videos.
# barplot()'s second positional argument is `width`, so passing the video
# counts next to the heights only changes bar widths (zero-video articles get
# zero-width bars). The video count is used as a grouping variable instead and
# the mean log(shares) of each group is plotted.
# NOTE(review): the mean per video count is an assumed reading of the intent;
# swap in sum/median if a different aggregate was meant.
mean_log_shares_by_videos <- tapply(shares_log, MyData$num_videos, mean)
barplot(
  mean_log_shares_by_videos,
  xlab = "Videos",
  ylab = "Mean log(Shares)",
  main = "log(Shares) vs Videos"
)

# Histogram of shares vs videos (not working):
# hist(MyData$shares,MyData$num_videos)
# Pie chart of news categories: each slice is the column sum of one
# data_channel_is_* indicator, labelled with its rounded percentage of the
# total across the six channels.
channel_columns <- c(
  "data_channel_is_lifestyle",
  "data_channel_is_entertainment",
  "data_channel_is_bus",
  "data_channel_is_socmed",
  "data_channel_is_tech",
  "data_channel_is_world"
)
slices <- vapply(MyData[channel_columns], sum, numeric(1), USE.NAMES = FALSE)

# Label order must match the order of `channel_columns`.
Clabels <- c(
  "Lifestyle", "Entertainment", "Business",
  "Social Media", "Technology", "World"
)
percentage <- round(slices / sum(slices) * 100)
# Append the percentage to each label, e.g. "<name> <pct>%".
Clabels <- paste0(Clabels, " ", percentage, "%")

pie(slices, labels = Clabels, main = "Pie Chart of News Categories")
# Total shares per news category, used for the category bar charts.
#
# Article counts per category (not shares) would instead be:
# lifestyle = sum(MyData$data_channel_is_lifestyle)
# entertainment = sum(MyData$data_channel_is_entertainment)
# business = sum(MyData$data_channel_is_bus)
# socialMedia = sum(MyData$data_channel_is_socmed)
# technology = sum(MyData$data_channel_is_tech)
# world = sum(MyData$data_channel_is_world)

# Indicator column for each category, in the order used by `categories`.
share_channel_columns <- c(
  lifestyle = "data_channel_is_lifestyle",
  entertainment = "data_channel_is_entertainment",
  business = "data_channel_is_bus",
  socialMedia = "data_channel_is_socmed",
  technology = "data_channel_is_tech",
  world = "data_channel_is_world"
)

# Row indices of the articles flagged for each category. which() drops NA
# flags, so a missing indicator neither errors nor propagates NA.
channel_rows <- lapply(
  share_channel_columns,
  function(column) which(MyData[[column]] > 0)
)

# Sum the shares of each category in one vectorised pass instead of a
# row-by-row loop. as.numeric() keeps the totals as doubles.
channel_shares <- vapply(
  channel_rows,
  function(rows) sum(as.numeric(MyData$shares[rows])),
  numeric(1)
)

lifestyle <- channel_shares[["lifestyle"]]
entertainment <- channel_shares[["entertainment"]]
business <- channel_shares[["business"]]
socialMedia <- channel_shares[["socialMedia"]]
technology <- channel_shares[["technology"]]
world <- channel_shares[["world"]]

# Per-category article content. Collected only when the data actually has a
# `content` column, so the script no longer stops if it is absent.
# NOTE(review): confirm whether OnlineNewsPopularity.csv is expected to carry
# a `content` column at all.
if ("content" %in% names(MyData)) {
  dfLS <- MyData[["content"]][channel_rows[["lifestyle"]]]
  dfET <- MyData[["content"]][channel_rows[["entertainment"]]]
  dfBU <- MyData[["content"]][channel_rows[["business"]]]
  dfSM <- MyData[["content"]][channel_rows[["socialMedia"]]]
  dfTC <- MyData[["content"]][channel_rows[["technology"]]]
  dfWO <- MyData[["content"]][channel_rows[["world"]]]
}

# Earlier experiments, kept for reference:
# hist(dfLS,breaks=500,xlab= "Shares",ylab="Frequency",main="Simple Histogram of lLifestyle",col = "lightgreen")
# plot(dfLS)
# lifestyle = lifestyle - 843300

# Unnamed numeric vector of total shares, ordered as in `share_channel_columns`.
categories <- c(
  lifestyle, entertainment, business,
  socialMedia, technology, world
)
# Bar charts of total shares per category (base graphics, then ggplot2).
# Plain category names are defined here, in the same order as `categories`.
# `Clabels` is not reused because the pie chart section appends article-count
# percentages to it, which would mislabel bars that show share totals.
category_names <- c(
  "Lifestyle", "Entertainment", "Business",
  "Social Media", "Technology", "World"
)

barplot(categories, main = "Categories", names.arg = category_names)

# The factor levels fix the bar order so the ggplot chart matches the base
# chart instead of being sorted alphabetically.
plotFrame <- data.frame(
  cats = factor(category_names, levels = category_names),
  vals = categories
)
ggplot(data = plotFrame, aes(x = cats, y = vals)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  # Print each total just above its bar.
  geom_text(aes(label = vals), vjust = -0.3, size = 3.5) +
  xlab("Categories") +
  ylab("Total Shares") +
  ggtitle("Categories Shares") +
  theme_get() +
  # Centre the plot title.
  theme(plot.title = element_text(hjust = 0.5))
# Summary statistics suggested by Dr. Golen.
# Column means are computed for numeric columns only: mean() on a non-numeric
# column returns NA with a warning. vapply() guarantees one double per column.
numeric_columns <- vapply(MyData, is.numeric, logical(1))
vapply(MyData[numeric_columns], mean, numeric(1), na.rm = TRUE)

# Descriptive statistics for every column via psych::describe().
library(psych)
description <- describe(MyData)

# Write the descriptive statistics to a CSV file in the working directory.
write.csv(description, file = "DataDescription.csv")