forked from PuerkitoBio/gocrawl
-
Notifications
You must be signed in to change notification settings - Fork 1
/
examples_test.go
59 lines (45 loc) · 1.82 KB
/
examples_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
package gocrawl_test
import (
"net/http"
"regexp"
"time"
"github.com/PuerkitoBio/gocrawl"
"github.com/PuerkitoBio/goquery"
)
// rxOk matches the normalized URLs the example is willing to enqueue: the
// duckduckgo.com root, and any path on that host beginning with an "a".
//
// The pattern is anchored at both ends. Without the leading anchor, a URL
// that merely ends with the host (for instance one carrying it in a query
// string, such as "/z?u=http://duckduckgo.com/a") would be accepted as well.
//
// NOTE(review): the scheme here is http while the seed passed to Run is
// https; presumably gocrawl's URL normalization rewrites the scheme and drops
// the trailing slash before Filter sees the URL - confirm against the
// normalization flags in use.
var rxOk = regexp.MustCompile(`^http://duckduckgo\.com(/a.*)?$`)
// ExampleExtender is the Extender implementation used by this example.
//
// It embeds the gocrawl-provided DefaultExtender so that only the methods the
// example cares about have to be written; everything else falls through to
// the default behaviour.
type ExampleExtender struct {
	// Supplies the default implementation of every method except Visit and
	// Filter, which are overridden on *ExampleExtender.
	gocrawl.DefaultExtender
}
// Visit replaces the DefaultExtender implementation.
//
// This is where the page would be processed, either through the goquery
// document (doc) or by reading res.Body directly. The example does nothing
// with the page and hands link discovery back to gocrawl.
func (x *ExampleExtender) Visit(ctx *gocrawl.URLContext, res *http.Response, doc *goquery.Document) (interface{}, bool) {
	// true as the second result lets gocrawl find the links on its own; the
	// nil first result means no links are supplied by this method.
	const letGocrawlFindLinks = true

	return nil, letGocrawlFindLinks
}
// Filter replaces the DefaultExtender implementation.
//
// It reports true only for a URL that has not been visited yet and whose
// normalized form matches rxOk.
func (x *ExampleExtender) Filter(ctx *gocrawl.URLContext, isVisited bool) bool {
	// Already seen: reject without looking at the URL at all.
	if isVisited {
		return false
	}

	normalized := ctx.NormalizedURL().String()
	return rxOk.MatchString(normalized)
}
// ExampleCrawl configures a crawler around ExampleExtender and runs it against
// the duckduckgo root.
func ExampleCrawl() {
	// Seed URL: the root of duckduckgo.
	const seed = "https://duckduckgo.com/"

	// Start from the default options, plugging in the custom extender.
	opts := gocrawl.NewOptions(&ExampleExtender{})

	// Always set a robot name, so that the most specific rules possible are
	// looked up in robots.txt ...
	opts.RobotUserAgent = "Example"
	// ... and mirror it in the user-agent string sent with requests, ideally
	// with a link so site owners can get in touch if there is an issue.
	opts.UserAgent = "Mozilla/5.0 (compatible; Example/1.0; +http://example.com)"

	// Play nice with ddgo when running the test: cap the number of visits
	// and wait between requests.
	opts.MaxVisits = 2
	opts.CrawlDelay = time.Second

	// Log everything.
	opts.LogFlags = gocrawl.LogAll

	// Build the crawler from the options and start crawling at the seed.
	gocrawl.NewCrawlerWithOptions(opts).Run(seed)

	// Remove "x" before Output: to activate the example (will run on go test)
	// xOutput: voluntarily fail to see log output
}