-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.go
150 lines (121 loc) · 3.41 KB
/
scraper.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
package duo
import (
"fmt"
"os"
"time"
"github.com/gocolly/colly"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
)
// Emoji markers used to render the one-character-per-URL progress bar.
const (
	E_BLACK = "\u2B1B"     // pending: URL not yet settled
	E_GREEN = "\U0001F7E9" // success: response received
	E_RED   = "\U0001F7E5" // failure: error after all retries
)
// initializes the logger
func init() {
// changing the default json output to a more human-readable format
log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
log.Info().Msg("DuoLingo Scraper")
}
// Scrape runs the configured collector: it visits every URL in cfg.URLS
// asynchronously, retries failed requests up to cfg.Retry times, renders a
// one-emoji-per-URL progress bar on stdout, and logs every URL that still
// failed after its retries were exhausted.
//
// Optional hooks (skipped when nil):
//   - cfg.OnInit     runs once before any request is made
//   - cfg.OnRequest  runs before each request
//   - cfg.OnResponse runs on each successful response (before OnHTML)
//   - cfg.OnError    runs on each failed request, including retried ones
//   - cfg.Finally    runs once after everything has settled
func (cfg *CollyConfigs) Scrape() {
	// Collect asynchronously; concurrency is bounded by the limit rule below.
	c := colly.NewCollector(colly.Async(true))

	// A colly LimitRule without a domain filter is rejected, so the
	// parallelism cap and the domain glob must live in the SAME rule.
	// (Previously they were installed as two separate rules and the
	// filterless parallelism rule was silently discarded.)
	glob := cfg.DomainGlob
	if glob == "" {
		glob = "*"
	}
	if err := c.Limit(&colly.LimitRule{DomainGlob: glob, Parallelism: cfg.Parallelism}); err != nil {
		log.Error().Msg(fmt.Sprintf("error setting limit rule: %v", err))
	}

	// User-supplied one-time setup, before any request is issued.
	if cfg.OnInit != nil {
		cfg.OnInit()
	}

	c.SetRequestTimeout(time.Millisecond * time.Duration(cfg.Timeout))

	// Draw the initial progress bar: one black square per pending URL.
	for range cfg.URLS {
		fmt.Print(E_BLACK)
	}
	fmt.Print("\r")

	// Each URL settles exactly once into this channel: a nil-error success
	// or the final error once retries are exhausted. The buffer holds every
	// possible outcome so callbacks never block on the send.
	progress := make(chan Error, len(cfg.URLS))

	// Register the user's OnHTML callbacks with their query selectors.
	for selector, handler := range cfg.OnHTML {
		c.OnHTML(selector, handler)
	}

	c.OnRequest(func(r *colly.Request) {
		// Track per-URL retry counts in the request context; initialize
		// the counter to 0 the first time a URL is requested.
		if cfg.Retry > 0 {
			id := r.URL.String()
			if r.Ctx.GetAny(id) == nil {
				r.Ctx.Put(id, 0)
			}
		}
		if cfg.OnRequest != nil {
			cfg.OnRequest(r)
		}
	})

	c.OnResponse(func(r *colly.Response) {
		// A response means success; note that OnHTML fires after OnResponse.
		progress <- Error{E: nil, Where: r.Request.URL.String()}
		if cfg.OnResponse != nil {
			cfg.OnResponse(r)
		}
	})

	c.OnError(func(r *colly.Response, err error) {
		// Retry until the limit is reached; only the FINAL failure is sent
		// to progress so each URL settles exactly once.
		id := r.Request.URL.String()
		if cfg.Retry > 0 {
			// comma-ok assertion: the unchecked .(int) could panic if the
			// context ever held a non-int value.
			if ret, ok := r.Ctx.GetAny(id).(int); ok {
				if ret < cfg.Retry {
					r.Request.Retry()
				} else {
					progress <- Error{E: err, Where: id}
				}
				r.Ctx.Put(id, ret+1)
			} else {
				// Counter missing or mistyped (should not happen): settle
				// now so the progress loop below cannot block forever.
				progress <- Error{E: err, Where: id}
			}
		} else {
			// Retries disabled: settle immediately. (Previously errors were
			// dropped here, deadlocking the progress loop below.)
			progress <- Error{E: err, Where: id}
		}
		if cfg.OnError != nil {
			cfg.OnError(r, err)
		}
	})

	// Kick off all visits.
	for _, url := range cfg.URLS {
		if err := c.Visit(url); err != nil {
			// url and err are Sprintf ARGUMENTS; the old code concatenated
			// url into the format string, garbling the message.
			log.Error().Msg(fmt.Sprintf("error visiting %s, error: %v", url, err))
			// Visit failed before the request was enqueued, so no callback
			// will ever settle this URL; settle it here to keep the
			// progress loop from blocking forever.
			progress <- Error{E: err, Where: url}
		}
	}

	// Wait for every URL to settle, updating the progress bar as we go.
	errors := make([]Error, 0, len(cfg.URLS))
	for range cfg.URLS {
		state := <-progress
		if state.E == nil {
			fmt.Print(E_GREEN)
		} else {
			fmt.Print(E_RED)
			errors = append(errors, state)
		}
	}
	c.Wait()
	fmt.Println()

	// Report each URL that ultimately failed.
	for _, e := range errors {
		log.Error().Msg(fmt.Sprintf("Error in %s, error: %v", e.Where, e.E))
	}

	// Running the user's teardown callback last.
	if cfg.Finally != nil {
		cfg.Finally()
	}
}