-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.R
134 lines (102 loc) · 4.05 KB
/
crawler.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# ---- Setup ----
# The workspace is deliberately NOT cleared here: `rm(list = ls())` would
# silently destroy the caller's objects when this script is sourced from an
# interactive session, and nothing below relies on a clean workspace.
library(data.table)
library(XML)
library(rvest)

# Global counter of failed fetches / malformed pages. The error handlers
# further down increment it with `<<-`; the crawl loop prints it as progress.
error_count <- 0
#' Scrape the attribute list of every listing detail page.
#'
#' For each URL the page is downloaded and the `<span>` texts found under
#' `.classifiedInfoList li` are taken as one row of values; the `<strong>`
#' texts of the first usable page become the column names. Tabs, newlines,
#' carriage returns and spaces are stripped from all texts.
#'
#' The result always has exactly one row per element of `sublinks`, in the
#' same order, so it can be column-bound to a per-listing table. A row is left
#' as `NA` when its page cannot be downloaded or when its number of values
#' differs from the reference page; each such case is printed and added to the
#' global `error_count`.
#'
#' @param sublinks Character vector of absolute detail-page URLs.
#' @return A character matrix with `length(sublinks)` rows, or `NULL` when
#'   `sublinks` is empty or no page yielded any value.
collect <- function(sublinks) {
  n_links <- length(sublinks)
  if (n_links == 0) {
    return(NULL)
  }

  strip_ws <- function(x) gsub("[\t\r\n ]", "", x)
  bump_error_count <- function() {
    # `error_count` lives in the global environment (shared with the caller).
    error_count <<- error_count + 1
  }

  values <- vector("list", n_links) # NULL entry == page could not be fetched
  header <- NULL
  n_col <- NULL

  for (i in seq_along(sublinks)) {
    page <- tryCatch(
      read_html(sublinks[i]),
      error = function(e) {
        print(paste(e))
        bump_error_count()
        NULL
      }
    )
    print(paste(sublinks[i]))
    if (is.null(page)) {
      # Do not fall through: reusing the previous page object would duplicate
      # the previous listing's values under this listing.
      next
    }

    info_items <- page %>%
      html_nodes(".classifiedInfoList") %>%
      html_nodes("li")
    value <- strip_ws(info_items %>% html_nodes("span") %>% html_text())

    # The first page that actually yields values fixes the table layout.
    if (is.null(n_col) && length(value) > 0) {
      n_col <- length(value)
      header <- strip_ws(info_items %>% html_nodes("strong") %>% html_text())
    }
    values[[i]] <- value
  }

  if (is.null(n_col)) {
    return(NULL)
  }

  # Preallocate with NA so failed or malformed pages keep their row position.
  newdata <- matrix(NA_character_, nrow = n_links, ncol = n_col)
  for (i in seq_along(values)) {
    value <- values[[i]]
    if (is.null(value)) {
      next # fetch failure, already counted above
    }
    if (length(value) == n_col) {
      newdata[i, ] <- value
    } else {
      print(paste0(
        "Unexpected number of attributes (", length(value), " instead of ",
        n_col, ") at ", sublinks[i]
      ))
      bump_error_count()
    }
  }

  # Only label the columns when labels and values line up one-to-one.
  if (length(header) == n_col) {
    colnames(newdata) <- header
  }
  newdata
}
# ---- Crawl of the paginated search results ----
# Walks the result pages of `url`, scrapes the summary columns of every
# listing plus the attributes of its detail page (via `collect()`), and
# accumulates all of it in `mydata`.
url <- "https://www.sahibinden.com/satilik"
site_root <- "https://www.sahibinden.com"

# Download and parse one page. On failure the error is printed, the global
# `error_count` is incremented and NULL is returned. Warnings are left to
# surface normally instead of aborting the read.
fetch_page <- function(page_url) {
  tryCatch(
    read_html(page_url),
    error = function(e) {
      print(paste(e))
      error_count <<- error_count + 1
      NULL
    }
  )
}

# Extract the listings of one parsed result page.
# Returns NULL when the page has no listings; otherwise a list with
#   data       - data.table of summary columns bound to the detail attributes
#   n_listings - number of m2 values found on the page (the paging step)
scrape_result_page <- function(page) {
  node_text <- function(css) {
    page %>%
      html_nodes(css) %>%
      html_text()
  }

  sublinks <- page %>%
    html_nodes(".classifiedTitle") %>%
    html_attr("href")
  emlak_tipi <- node_text(".searchResultsTagAttributeValue")
  ilan_basligi <- node_text(".classifiedTitle")

  # The attribute cells alternate m2, rooms, m2, rooms, ... Logical masks
  # (unlike seq(1, 0, 2)) also work when the page has no cells at all.
  m2_oda <- node_text(".searchResultsAttributeValue")
  is_odd <- seq_along(m2_oda) %% 2 == 1
  m2 <- m2_oda[is_odd]
  oda <- m2_oda[!is_odd]

  ilan_tarihi <- node_text(".searchResultsDateValue")
  fiyat <- node_text(".searchResultsPriceValue")
  location <- node_text(".searchResultsLocationValue")

  # Project links are dropped from the titles and links only.
  # NOTE(review): the other columns are not filtered; presumably project
  # entries do not carry those cells - verify against a live page, otherwise
  # cbind() below recycles the shorter columns.
  pos <- grep("/projeler/p", sublinks)
  if (length(pos) != 0) {
    ilan_basligi <- ilan_basligi[-pos]
    sublinks <- sublinks[-pos]
  }

  if (length(sublinks) == 0 || length(m2) == 0) {
    return(NULL)
  }

  listings <- data.table(
    cbind(emlak_tipi, ilan_basligi, m2, oda, ilan_tarihi, fiyat, location)
  )
  details <- collect(paste0(site_root, sublinks))
  list(data = cbind(listings, details), n_listings = length(m2))
}

mydata <- NULL
nextPage <- 0 # pagingOffset of the page to request next
repeat {
  # The first page is the bare URL; later pages carry the running offset.
  newUrl <- if (nextPage == 0) {
    url
  } else {
    paste0(url, "?pagingOffset=", toString(nextPage))
  }
  print(paste("------------------"))
  print(paste(newUrl))

  sahibinden <- fetch_page(newUrl)
  if (is.null(sahibinden)) {
    message("Stopping crawl: could not fetch ", newUrl)
    break
  }

  page_result <- scrape_result_page(sahibinden)
  if (is.null(page_result)) {
    # An empty page marks the end of the results.
    message("Stopping crawl: no listings found at ", newUrl)
    break
  }

  mydata <- if (is.null(mydata)) {
    page_result$data
  } else {
    rbind(mydata, page_result$data)
  }
  nextPage <- nextPage + page_result$n_listings
  print(paste(dim(mydata), error_count))
}