forked from info201a-s18/mini-demos
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebscraper-demo.R
160 lines (101 loc) · 4.2 KB
/
webscraper-demo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
## Let's make a webscraper!
## Sources:
## https://www.analyticsvidhya.com/blog/2017/03/beginners-guide-on-web-scraping-in-r-using-rvest-with-hands-on-knowledge/
## https://www.rdocumentation.org/packages/rvest/versions/0.3.2/topics/html_nodes
## https://www.rdocumentation.org/packages/rvest/versions/0.3.2/topics/html_text
## Uncomment this to install packages
#install.packages('rvest')
# Load in 'rvest' package
library('rvest')
# Specify the URL endpoint we are using.
# (This note used to be a bare string literal, which R evaluates and
# auto-prints at top level instead of treating it as a comment.)
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"
# Download the page and parse its HTML into a document we can query
webpage <- read_html(url)
# Rank column ----
# html_nodes(): pull pieces out of an HTML document via CSS selectors / XPath.
# html_text(): extract the text of the selected nodes.
rank_data_html <- webpage %>% html_nodes(".text-primary")
rank_data <- rank_data_html %>% html_text()
# Preview the rank values as scraped (still character strings)
head(rank_data)
# Coerce the rank strings to numeric and preview again
rank_data <- rank_data %>% as.numeric()
head(rank_data)
# Title column ----
# Select the link inside each list item's header with a CSS selector
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
# Reduce the selected nodes to their text content
title_data <- title_data_html %>% html_text()
# Preview the first few titles
head(title_data)
# Description column ----
# Select the muted-text element that directly follows each ratings bar
description_data_html <- webpage %>%
  html_nodes(".ratings-bar+ .text-muted")
# Reduce the selected nodes to their text content
description_data <- description_data_html %>% html_text()
# Preview the raw descriptions
head(description_data)
# Data-preprocessing: strip the newline characters from every description
description_data <- description_data %>% gsub("\n", "", .)
# Runtime column ----
# Select each movie's runtime element with a CSS selector
runtime_data_html <- webpage %>% html_nodes(".text-muted .runtime")
# Reduce the selected nodes to their text content
runtime_data <- runtime_data_html %>% html_text()
# Preview the raw runtime strings
head(runtime_data)
# Data-preprocessing: remove the " min" suffix, then convert to numeric
runtime_data <- gsub(" min", "", runtime_data)
runtime_data <- as.numeric(runtime_data)
# Preview the cleaned runtimes. This previously previewed rank_data, which
# showed the wrong column and hid any problems with the conversion above.
head(runtime_data)
# Genre column ----
# Select the genre elements and reduce them to their text content
genre_data_html <- webpage %>% html_nodes(".genre")
genre_data <- genre_data_html %>% html_text()
# Preview the raw genre strings
head(genre_data)
# Data-preprocessing, in order: remove newlines, remove spaces, then drop
# everything from the first comma onward so only the first genre remains
genre_data <- genre_data %>%
  gsub("\n", "", .) %>%
  gsub(" ", "", .) %>%
  gsub(",.*", "", .)
# Convert each genre from text to a factor
genre_data <- genre_data %>% as.factor()
# Preview the cleaned genre factor
head(genre_data)
# IMDB rating column ----
# Select the emphasized rating value inside each rating element
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
# Reduce the selected nodes to their text content
rating_data <- rating_data_html %>% html_text()
# Preview the raw rating strings
head(rating_data)
# Data-preprocessing: convert the ratings to numeric
rating_data <- rating_data %>% as.numeric()
# Preview the numeric ratings
head(rating_data)
# Directors column ----
# Select the first link in the paragraph that follows the muted-text element
directors_data_html <- webpage %>%
  html_nodes(".text-muted+ p a:nth-child(1)")
# Reduce the selected nodes to their text content
directors_data <- directors_data_html %>% html_text()
# Preview the director names
head(directors_data)
# Data-preprocessing: store the directors as a factor
directors_data <- directors_data %>% as.factor()
# Actors column ----
# Select the link that directly follows the ".ghost" separator in each entry
actors_data_html <- webpage %>%
  html_nodes(".lister-item-content .ghost+ a")
# Reduce the selected nodes to their text content
actors_data <- actors_data_html %>% html_text()
# Preview the actor names
head(actors_data)
# Data-preprocessing: store the actors as a factor
actors_data <- actors_data %>% as.factor()
# Gross revenue column ----
# Not every movie lists a gross figure. Query one node per movie entry so
# that entries without a match come back as missing (NA) instead of being
# silently skipped, which would leave rev_data shorter than the other columns.
# NOTE(review): assumes each movie's gross figure sits inside its
# ".lister-item-content" container -- confirm against the live page.
movie_items <- webpage %>% html_nodes(".lister-item-content")
rev_data_html <- movie_items %>% html_node(".ghost~ .text-muted+ span")
# Convert the gross revenue nodes to text (NA where no figure is listed)
rev_data <- rev_data_html %>% html_text()
# Preview the raw revenue strings
head(rev_data)
# Data-preprocessing: remove the "$" and "M" signs. Matching the characters
# (rather than slicing fixed positions 2-6) keeps every digit of values such
# as "$532.18M", which the positional slice truncated to "532.1".
rev_data <- gsub("[$M]", "", rev_data)
# Check the length of the revenue data (previously referenced the undefined
# name gross_data, which stopped the script with an error)
length(rev_data)
# Data-preprocessing: convert revenue to numeric; missing entries stay NA
rev_data <- as.numeric(rev_data)
# Check the length again: it should be unchanged by the conversion
length(rev_data)
#library('ggplot2')
# let's draw some plots!