Refactor for v2 (#88)
* Use bool

* Add command type

* Refactor cache

* Rename daemons type

* Rename arguments

* Rename variables

* Rename variables

* Simplify checker

* Insert a blank line

* Fix test

* Move object initialization

* Fix format

* Fix linting

* Add httpClient interface

* Fix warnings

* Remove unused variable

* Rename urlInspector

* Add robots txt fetcher

* Utilize fake http response

* Remove unused variables

* Use nil check

* Rename variable

* Rename receiver

* Define agentName

* Fix argument name

* Fix urlValidator tests

* Move library initialization code

* Rename method

* Move html.Parse

* Move redirect logic

* Rename scraper

* Rename url validator

* Fix format

* Fix method name

* Rename link fetcher

* Fold if statement

* Refactor link finder

* Fix test

* Uncomment checker test

* Enable page parser tests

* Remove fetch result tests

* Enable link finder tests

* Rename a variable

* Add comment

* Use fetch result type only for caching

* Fix link fetcher tests

* Refactor link fetcher options

* Build first

* Remove integration tests

* Remove dependencies.sh

* Use newPage

* Rename fake files

* Pass stderr to command

* Add throttled http client

* Use boxed objects

* Fix build

* Rename argument

* Fix checker tests

* Add http client factory

* Bump version
raviqqe authored Sep 22, 2020
1 parent 8ab5412 commit b789d00
Showing 64 changed files with 1,375 additions and 1,545 deletions.
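Among the changes listed above, "Add httpClient interface" and "Add throttled http client" introduce an HTTP layer whose files are not among the diffs shown below. Purely as a hedged sketch — the interface shape and every identifier here are assumptions, not code from this commit — a semaphore-throttled wrapper over such an interface could look like:

package main

import (
    "fmt"
    "net/url"
)

// Hypothetical stand-ins for the httpClient interface the commit adds;
// the real method set is not visible on this page.
type httpResponse interface {
    StatusCode() int
}

type httpClient interface {
    Get(u *url.URL) (httpResponse, error)
}

// throttledHttpClient caps in-flight requests with a channel used as a
// counting semaphore.
type throttledHttpClient struct {
    client    httpClient
    semaphore chan struct{}
}

func newThrottledHttpClient(c httpClient, maxConnections int) *throttledHttpClient {
    return &throttledHttpClient{c, make(chan struct{}, maxConnections)}
}

func (c *throttledHttpClient) Get(u *url.URL) (httpResponse, error) {
    c.semaphore <- struct{}{}        // acquire a slot
    defer func() { <-c.semaphore }() // release it when the request finishes

    return c.client.Get(u)
}

// A tiny fake client so the sketch compiles and runs on its own.
type fakeClient struct{}

type fakeResponse struct{ code int }

func (r fakeResponse) StatusCode() int { return r.code }

func (fakeClient) Get(*url.URL) (httpResponse, error) {
    return fakeResponse{200}, nil
}

func main() {
    c := newThrottledHttpClient(fakeClient{}, 512)

    u, _ := url.Parse("https://foo.com")
    r, _ := c.Get(u)

    fmt.Println(r.StatusCode()) // 200
}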
6 changes: 2 additions & 4 deletions .circleci/config.yml
@@ -25,11 +25,9 @@ jobs:
     working_directory: /go/src/github.com/raviqqe/muffet
     steps:
       - checkout
-      - run: tools/dependencies.sh
-      - run: tools/lint.sh
       - run: tools/build.sh
-      - run: tools/unit_test.sh
-      - run: tools/integration_test.sh
+      - run: tools/lint.sh
+      - run: tools/test.sh
       - persist_to_workspace:
           root: .
           paths:
39 changes: 14 additions & 25 deletions arguments.go
@@ -50,46 +50,37 @@ type arguments struct {
     OnePageOnly bool
 }
 
-func getArguments(ss []string) (arguments, error) {
-    args := parseArguments(usage, ss)
+func getArguments(regexps []string) (arguments, error) {
+    args := parseArguments(usage, regexps)
 
     b, err := parseInt(args["--buffer-size"].(string))
-
     if err != nil {
         return arguments{}, err
     }
 
     c, err := parseInt(args["--concurrency"].(string))
-
     if err != nil {
         return arguments{}, err
     }
 
-    ss, _ = args["--exclude"].([]string)
+    ss, _ := args["--exclude"].([]string)
     rs, err := compileRegexps(ss)
-
     if err != nil {
         return arguments{}, err
     }
 
-    hs := map[string]string(nil)
-
-    if ss := args["--header"]; ss != nil {
-        hs, err = parseHeaders(ss.([]string))
-
-        if err != nil {
-            return arguments{}, err
-        }
+    ss, _ = args["--header"].([]string)
+    hs, err := parseHeaders(ss)
+    if err != nil {
+        return arguments{}, err
     }
 
     r, err := parseInt(args["--limit-redirections"].(string))
-
     if err != nil {
         return arguments{}, err
     }
 
     t, err := parseInt(args["--timeout"].(string))
-
     if err != nil {
         return arguments{}, err
     }
@@ -113,8 +104,7 @@ func getArguments(ss []string) (arguments, error) {
 }
 
 func parseArguments(u string, ss []string) map[string]interface{} {
-    args, err := docopt.ParseArgs(u, ss, "1.3.3")
-
+    args, err := docopt.ParseArgs(u, ss, version)
     if err != nil {
         panic(err)
     }
@@ -127,12 +117,11 @@ func parseInt(s string) (int, error) {
     return int(i), err
 }
 
-func compileRegexps(ss []string) ([]*regexp.Regexp, error) {
-    rs := make([]*regexp.Regexp, 0, len(ss))
+func compileRegexps(regexps []string) ([]*regexp.Regexp, error) {
+    rs := make([]*regexp.Regexp, 0, len(regexps))
 
-    for _, s := range ss {
+    for _, s := range regexps {
         r, err := regexp.Compile(s)
-
         if err != nil {
             return nil, err
         }
@@ -143,10 +132,10 @@ func compileRegexps(ss []string) ([]*regexp.Regexp, error) {
     return rs, nil
 }
 
-func parseHeaders(ss []string) (map[string]string, error) {
-    m := make(map[string]string, len(ss))
+func parseHeaders(headers []string) (map[string]string, error) {
+    m := make(map[string]string, len(headers))
 
-    for _, s := range ss {
+    for _, s := range headers {
         i := strings.IndexRune(s, ':')
 
         if i < 0 {
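The renames in this file keep the helpers' behavior; only parameter names and the docopt version argument change. A self-contained sketch of the renamed compileRegexps — copied from the hunk above, with the append in the elided middle of the function inferred — and of how --exclude patterns reach it:

package main

import (
    "fmt"
    "regexp"
)

func compileRegexps(regexps []string) ([]*regexp.Regexp, error) {
    rs := make([]*regexp.Regexp, 0, len(regexps))

    for _, s := range regexps {
        r, err := regexp.Compile(s)
        if err != nil {
            return nil, err
        }

        // This append sits in the part of the function the hunk elides.
        rs = append(rs, r)
    }

    return rs, nil
}

func main() {
    // getArguments passes the raw --exclude strings through this helper.
    rs, err := compileRegexps([]string{`^https://example\.com/private`})
    if err != nil {
        panic(err)
    }

    fmt.Println(rs[0].MatchString("https://example.com/private/x")) // true
}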
16 changes: 8 additions & 8 deletions cache.go
@@ -11,23 +11,23 @@ func newCache() cache {
     return cache{&sync.Map{}, &sync.Map{}}
 }
 
-func (c cache) LoadOrStore(s string) (interface{}, func(interface{}), bool) {
-    if x, ok := c.values.Load(s); ok {
-        return x, nil, true
+func (c cache) LoadOrStore(key string) (interface{}, func(interface{})) {
+    if x, ok := c.values.Load(key); ok {
+        return x, nil
     }
 
     g := &sync.WaitGroup{}
     g.Add(1)
 
-    if g, ok := c.locks.LoadOrStore(s, g); ok {
+    if g, ok := c.locks.LoadOrStore(key, g); ok {
         g.(*sync.WaitGroup).Wait()
-        x, _ := c.values.Load(s)
+        x, _ := c.values.Load(key)
 
-        return x, nil, true
+        return x, nil
     }
 
     return nil, func(x interface{}) {
-        c.values.Store(s, x)
+        c.values.Store(key, x)
         g.Done()
-    }, false
+    }
 }
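The practical effect of the new LoadOrStore signature: a nil store callback now signals a cache hit, replacing the removed boolean. A self-contained sketch of the contract — the cache struct declaration falls outside the hunk, so its field names are inferred from newCache:

package main

import (
    "fmt"
    "sync"
)

type cache struct {
    values *sync.Map // stored results, keyed by URL
    locks  *sync.Map // per-key WaitGroups that block concurrent callers
}

func newCache() cache {
    return cache{&sync.Map{}, &sync.Map{}}
}

func (c cache) LoadOrStore(key string) (interface{}, func(interface{})) {
    if x, ok := c.values.Load(key); ok {
        return x, nil
    }

    g := &sync.WaitGroup{}
    g.Add(1)

    if g, ok := c.locks.LoadOrStore(key, g); ok {
        // Another goroutine is computing this key; wait for its store.
        g.(*sync.WaitGroup).Wait()
        x, _ := c.values.Load(key)

        return x, nil
    }

    return nil, func(x interface{}) {
        c.values.Store(key, x)
        g.Done()
    }
}

func main() {
    c := newCache()

    // First caller: a miss, so the callback is non-nil and the caller is
    // responsible for computing and storing the value.
    x, store := c.LoadOrStore("https://foo.com")
    fmt.Println(x, store == nil) // <nil> false
    store(42)

    // Later callers get the stored value and a nil callback.
    x, store = c.LoadOrStore("https://foo.com")
    fmt.Println(x, store == nil) // 42 true
}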
13 changes: 5 additions & 8 deletions cache_test.go
@@ -15,19 +15,17 @@ func TestNewCache(t *testing.T) {
 func TestCacheLoadOrStore(t *testing.T) {
     c := newCache()
 
-    x, f, ok := c.LoadOrStore("https://foo.com")
+    x, f := c.LoadOrStore("https://foo.com")
 
     assert.Nil(t, x)
     assert.NotNil(t, f)
-    assert.False(t, ok)
 
     f(42)
 
-    x, f, ok = c.LoadOrStore("https://foo.com")
+    x, f = c.LoadOrStore("https://foo.com")
 
     assert.Equal(t, 42, x)
     assert.Nil(t, f)
-    assert.True(t, ok)
 }
 
 func TestCacheLoadOrStoreConcurrency(t *testing.T) {
@@ -40,9 +38,9 @@ func TestCacheLoadOrStoreConcurrency(t *testing.T) {
         g.Add(1)
 
         go func() {
-            x, f, ok := c.LoadOrStore("https://foo.com")
+            x, f := c.LoadOrStore("https://foo.com")
 
-            if ok {
+            if f == nil {
                 assert.Equal(t, 42, x)
                 atomic.AddInt32(&l, 1)
             } else {
@@ -65,8 +63,7 @@ func BenchmarkCacheLoadOrStore(b *testing.B) {
     c := newCache()
     g := &sync.WaitGroup{}
 
-    _, f, ok := c.LoadOrStore("https://foo.com")
-    assert.False(b, ok)
+    _, f := c.LoadOrStore("https://foo.com")
    f(42)
 
     b.ResetTimer()
88 changes: 28 additions & 60 deletions checker.go
@@ -1,75 +1,43 @@
 package main
 
 import (
-    "crypto/tls"
-    "errors"
     "sync"
 
     "github.com/fatih/color"
-    "github.com/valyala/fasthttp"
 )
 
 type checker struct {
-    fetcher
-    daemons      daemons
-    urlInspector urlInspector
-    results      chan pageResult
-    donePages    concurrentStringSet
+    fetcher       *linkFetcher
+    linkValidator *linkValidator
+    daemonManager *daemonManager
+    results       chan pageResult
+    donePages     concurrentStringSet
+    onePageOnly   bool
 }
 
-func newChecker(s string, o checkerOptions) (checker, error) {
-    o.Initialize()
-
-    c := &fasthttp.Client{
-        MaxConnsPerHost: o.Concurrency,
-        ReadBufferSize:  o.BufferSize,
-        TLSConfig: &tls.Config{
-            InsecureSkipVerify: o.SkipTLSVerification,
-        },
-    }
-    f := newFetcher(c, o.fetcherOptions)
-    r, err := f.Fetch(s)
-
-    if err != nil {
-        return checker{}, err
-    }
-
-    p, ok := r.Page()
-
-    if !ok {
-        return checker{}, errors.New("non-HTML page")
-    }
-
-    ui, err := newURLInspector(c, p.URL().String(), o.FollowRobotsTxt, o.FollowSitemapXML)
-
-    if err != nil {
-        return checker{}, err
-    }
-
-    ch := checker{
+func newChecker(f *linkFetcher, v *linkValidator, concurrency int, onePageOnly bool) *checker {
+    return &checker{
         f,
-        newDaemons(o.Concurrency),
-        ui,
-        make(chan pageResult, o.Concurrency),
+        v,
+        newDaemonManager(concurrency),
+        make(chan pageResult, concurrency),
         newConcurrentStringSet(),
+        onePageOnly,
     }
-
-    ch.addPage(p)
-
-    return ch, nil
 }
 
-func (c checker) Results() <-chan pageResult {
+func (c *checker) Results() <-chan pageResult {
     return c.results
 }
 
-func (c checker) Check() {
-    c.daemons.Run()
+func (c *checker) Check(page *page) {
+    c.addPage(page)
+    c.daemonManager.Run()
 
     close(c.results)
 }
 
-func (c checker) checkPage(p *page) {
+func (c *checker) checkPage(p *page) {
     us := p.Links()
 
     sc := make(chan string, len(us))
@@ -87,39 +55,39 @@ func (c checker) checkPage(p *page) {
         go func(u string) {
             defer w.Done()
 
-            r, err := c.fetcher.Fetch(u)
+            status, p, err := c.fetcher.Fetch(u)
 
             if err == nil {
-                sc <- formatLinkSuccess(u, r.StatusCode())
+                sc <- formatLinkSuccess(u, status)
             } else {
                 ec <- formatLinkError(u, err)
             }
 
-            // only consider adding the page to the list if we're recursing
-            if !c.fetcher.options.OnePageOnly {
-                if p, ok := r.Page(); ok && c.urlInspector.Inspect(p.URL()) {
-                    c.addPage(p)
-                }
+            if !c.onePageOnly && p != nil && c.linkValidator.Validate(p.URL()) {
+                c.addPage(p)
             }
         }(u)
     }
 
     w.Wait()
 
+    close(sc)
+    close(ec)
+
     c.results <- newPageResult(p.URL().String(), stringChannelToSlice(sc), stringChannelToSlice(ec))
 }
 
-func (c checker) addPage(p *page) {
+func (c *checker) addPage(p *page) {
     if !c.donePages.Add(p.URL().String()) {
-        c.daemons.Add(func() { c.checkPage(p) })
+        c.daemonManager.Add(func() { c.checkPage(p) })
     }
 }
 
 func stringChannelToSlice(sc <-chan string) []string {
     ss := make([]string, 0, len(sc))
 
-    for i := 0; i < cap(ss); i++ {
-        ss = append(ss, <-sc)
+    for s := range sc {
+        ss = append(ss, s)
     }
 
     return ss
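Two notes on this refactor. First, newChecker no longer builds a fasthttp client or fetches the root page itself; the caller constructs the dependencies and hands the first page to Check. A hedged sketch of the new driving loop — a fragment, not a complete program, where f, v, concurrency, and rootPage are assumed to come from the renamed linkFetcher and linkValidator constructors outside the shown hunks:

c := newChecker(f, v, concurrency, false)

// Check seeds the root page, runs the daemons, and closes the results
// channel when the crawl finishes, so ranging over Results terminates.
go c.Check(rootPage)

for r := range c.Results() {
    fmt.Println(r)
}

Second, the new close(sc) and close(ec) calls are what make the rewritten stringChannelToSlice correct: a range over a channel only terminates once the channel is closed. A self-contained illustration:

package main

import "fmt"

// drain mirrors the refactored stringChannelToSlice above.
func drain(sc <-chan string) []string {
    ss := make([]string, 0, len(sc))

    for s := range sc {
        ss = append(ss, s)
    }

    return ss
}

func main() {
    sc := make(chan string, 2)
    sc <- "https://foo.com"
    sc <- "https://bar.com"
    close(sc) // without this, drain would block forever

    fmt.Println(drain(sc)) // [https://foo.com https://bar.com]
}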
10 changes: 0 additions & 10 deletions checker_option.go

This file was deleted.

14 changes: 0 additions & 14 deletions checker_option_test.go

This file was deleted.

(The remaining changed files are not rendered on this page.)
