-
Notifications
You must be signed in to change notification settings - Fork 21
/
master_web_scrape_script.R
109 lines (91 loc) · 3.75 KB
/
master_web_scrape_script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# install.packages("rvest")
library(rvest)
library(stringr)
#################################################################################
# ingress
#################################################################################
# Timestamp taken once, before the page is fetched.
now <- Sys.time()
# Listing page to scrape; download it and parse it into an HTML document.
url <- "https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38"
webpage <- url %>% read_html()
#################################################################################
# web scraping
#################################################################################
############
# feature: card name
############
# Text content of every node matching the ".item-title" selector.
card_name <- html_text(html_nodes(webpage, ".item-title"))
################
# feature: current price
################
# Text content of the <strong> elements nested under ".price-current" nodes.
# NOTE(review): only the <strong> child is read -- presumably the whole-dollar
# part of the price; confirm against the page markup.
cur_price <- html_text(html_nodes(webpage, ".price-current strong"))
################
# feature: original price
################
# Whitespace-trimmed text of every ".price-was" node.
org_price <- webpage %>% html_nodes(".price-was") %>% html_text(trim = TRUE)
# Extract the first decimal number from each string. The integer part may
# carry thousands separators ("1,299.99"); the previous pattern
# "\\d{1,}\\.\\d{1,}" could not cross the comma and returned "299.99".
# Strings with no price yield NA, exactly as before.
needle <- "\\d[\\d,]*\\.\\d+"
org_price <- str_extract(string = org_price, pattern = needle)
# Drop the separators so the value converts cleanly with as.numeric();
# NA entries stay NA.
org_price <- str_replace_all(string = org_price, pattern = ",", replacement = "")
################
# feature: rating
################
# problem: not every graphics card has a rating
# plan: collect (product id, rating) pairs from the rating nodes themselves,
#       so they can be joined to the main table on product id
rating_nodes <- html_nodes(webpage, ".item-rating")
# product id
# href layout:            <url>"Item="<pid>"&"<stuff>
# after the "Item=" cut:  <pid>"&"<stuff>
after_item <- str_split_fixed(
  html_attr(rating_nodes, "href"),
  pattern = "Item=",
  n = 2
)[, 2]
# after the "&" cut:      [,1] = <pid>   [,2] = <stuff>
rate.pid.split <- str_split_fixed(after_item, pattern = "&", n = 2)
rate.pid <- rate.pid.split[, 1]
# rating
# title layout: <string>"+ "<rating>; keep what follows the "+ " delimiter
rating <- str_split_fixed(
  string = html_attr(rating_nodes, "title"),
  pattern = "\\+\\s",
  n = 2
)[, 2]
# two-column table: product id, rating
rating_df <- as.data.frame(cbind(rate.pid, rating))
# combine
#################################################################################
# data binding
#################################################################################
# Assemble the main table: one row per scraped card name.
# stringsAsFactors = FALSE keeps the names as character on R < 4.0 too.
graphics_cards <- as.data.frame(card_name, stringsAsFactors = FALSE)
graphics_cards$scrape_date <- now
# The scraped current-price vector is `cur_price`; the previously referenced
# `current_price` is never defined and stopped the script with
# "object 'current_price' not found".
graphics_cards$cur_price <- cur_price
graphics_cards$org_price <- org_price
# Ratings exist only for rated cards, so `rating` can be shorter than the
# table. `$<-` silently recycles a shorter vector whenever its length divides
# the row count, which would attach ratings to the wrong cards. Attach
# positionally only when the lengths agree; otherwise leave the column NA
# (the product-id keyed values remain available in `rating_df`).
if (length(rating) == nrow(graphics_cards)) {
  graphics_cards$rating <- rating
} else {
  warning(
    "Scraped ", length(rating), " ratings for ", nrow(graphics_cards),
    " cards; rating column set to NA.",
    call. = FALSE
  )
  graphics_cards$rating <- NA_character_
}
#######################
# feature: sales price
#######################
# logic: original price - current price = sales discount
# A card with no scraped original price is treated as not discounted:
# its missing org_price is filled in with its current price.
na.org_price <- is.na(graphics_cards$org_price)
graphics_cards$org_price[na.org_price] <- graphics_cards$cur_price[na.org_price]
# cast into numeric
# Thousands separators are removed first: as.numeric("1,299") is NA (with a
# warning), which would blank out every price of 1,000 or more.
graphics_cards$org_price <- as.numeric(
  gsub(",", "", graphics_cards$org_price, fixed = TRUE)
)
graphics_cards$cur_price <- as.numeric(
  gsub(",", "", graphics_cards$cur_price, fixed = TRUE)
)
# original price - current price = sales discount
graphics_cards$sales_amt <- graphics_cards$org_price - graphics_cards$cur_price
#######################
# feature: discount %
#######################
# Discount as a fraction of the original price: sales amount / original price.
graphics_cards[["discount"]] <-
  graphics_cards[["sales_amt"]] / graphics_cards[["org_price"]]
#######################
# feature: on_sale
#######################
# logic: a card is on sale when its discount, as a fraction of the original
#        price, is strictly greater than `threshold`
# key: 0 = not on sale
#      1 = on sale
threshold <- 0.03
graphics_cards$on_sale <- 0
# which() keeps only rows where the comparison is TRUE. Indexing with the raw
# logical vector fails with "missing values are not allowed in subscripted
# assignments of data frames" as soon as one discount is NA or NaN; such rows
# now simply stay at 0.
on_sale_rows <- which(graphics_cards$discount > threshold)
graphics_cards[on_sale_rows, "on_sale"] <- 1