Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make accepted HTTP response status codes configurable #364

Merged
merged 2 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .snapshots/TestHelp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ Usage:
muffet.test [options] <url>

Application Options:
--accepted-status-codes=<codes> Accepted HTTP response status codes
(default: 200..300)
-b, --buffer-size=<size> HTTP response buffer size in bytes
(default: 4096)
-c, --max-connections=<count> Maximum number of HTTP connections
Expand Down
25 changes: 16 additions & 9 deletions arguments.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,16 @@ import (
)

type arguments struct {
BufferSize int `short:"b" long:"buffer-size" value-name:"<size>" default:"4096" description:"HTTP response buffer size in bytes"`
MaxConnections int `short:"c" long:"max-connections" value-name:"<count>" default:"512" description:"Maximum number of HTTP connections"`
MaxConnectionsPerHost int `long:"max-connections-per-host" value-name:"<count>" default:"512" description:"Maximum number of HTTP connections per host"`
MaxResponseBodySize int `long:"max-response-body-size" value-name:"<size>" default:"10000000" description:"Maximum response body size to read"`
RawExcludedPatterns []string `short:"e" long:"exclude" value-name:"<pattern>..." description:"Exclude URLs matched with given regular expressions"`
RawIncludedPatterns []string `short:"i" long:"include" value-name:"<pattern>..." description:"Include URLs matched with given regular expressions"`
FollowRobotsTxt bool `long:"follow-robots-txt" description:"Follow robots.txt when scraping pages"`
FollowSitemapXML bool `long:"follow-sitemap-xml" description:"Scrape only pages listed in sitemap.xml (deprecated)"`
RawHeaders []string `long:"header" value-name:"<header>..." description:"Custom headers"`
RawAcceptedStatusCodes string `long:"accepted-status-codes" value-name:"<codes>" default:"200..300" description:"Accepted HTTP response status codes"`
BufferSize int `short:"b" long:"buffer-size" value-name:"<size>" default:"4096" description:"HTTP response buffer size in bytes"`
MaxConnections int `short:"c" long:"max-connections" value-name:"<count>" default:"512" description:"Maximum number of HTTP connections"`
MaxConnectionsPerHost int `long:"max-connections-per-host" value-name:"<count>" default:"512" description:"Maximum number of HTTP connections per host"`
MaxResponseBodySize int `long:"max-response-body-size" value-name:"<size>" default:"10000000" description:"Maximum response body size to read"`
RawExcludedPatterns []string `short:"e" long:"exclude" value-name:"<pattern>..." description:"Exclude URLs matched with given regular expressions"`
RawIncludedPatterns []string `short:"i" long:"include" value-name:"<pattern>..." description:"Include URLs matched with given regular expressions"`
FollowRobotsTxt bool `long:"follow-robots-txt" description:"Follow robots.txt when scraping pages"`
FollowSitemapXML bool `long:"follow-sitemap-xml" description:"Scrape only pages listed in sitemap.xml (deprecated)"`
RawHeaders []string `long:"header" value-name:"<header>..." description:"Custom headers"`
// TODO Remove a short option.
IgnoreFragments bool `short:"f" long:"ignore-fragments" description:"Ignore URL fragments"`
Format string `long:"format" description:"Output format" default:"text" choice:"text" choice:"json" choice:"junit"`
Expand All @@ -40,6 +41,7 @@ type arguments struct {
Help bool `short:"h" long:"help" description:"Show this help"`
Version bool `long:"version" description:"Show version"`
URL string
AcceptedStatusCodes statusCodeCollection
ExcludedPatterns []*regexp.Regexp
IncludePatterns []*regexp.Regexp
Header http.Header
Expand Down Expand Up @@ -76,6 +78,11 @@ func getArguments(ss []string) (*arguments, error) {
return nil, err
}

args.AcceptedStatusCodes, err = parseStatusCodeCollection(args.RawAcceptedStatusCodes)
if err != nil {
return nil, err
}

if args.Format == "junit" && args.Verbose {
return nil, errors.New("verbose option not supported for JUnit output")
}
Expand Down
2 changes: 2 additions & 0 deletions arguments_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
func TestGetArguments(t *testing.T) {
for _, ss := range [][]string{
{"https://foo.com"},
{"--accepted-status-codes", "200..300,403", "https://foo.com"},
{"-b", "42", "https://foo.com"},
{"--buffer-size", "42", "https://foo.com"},
{"-c", "1", "https://foo.com"},
Expand Down Expand Up @@ -48,6 +49,7 @@ func TestGetArguments(t *testing.T) {
func TestGetArgumentsError(t *testing.T) {
for _, ss := range [][]string{
{},
{"--accepted-status-codes", "foo", "https://foo.com"},
{"-b", "foo", "https://foo.com"},
{"--buffer-size", "foo", "https://foo.com"},
{"-c", "foo", "https://foo.com"},
Expand Down
1 change: 1 addition & 0 deletions command.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ func (c *command) runWithError(ss []string) (bool, error) {
args.MaxConnectionsPerHost,
),
args.MaxRedirections,
args.AcceptedStatusCodes,
)

fl := newLinkFilterer(args.ExcludedPatterns, args.IncludePatterns)
Expand Down
20 changes: 11 additions & 9 deletions redirect_http_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@ import (
)

type redirectHttpClient struct {
client httpClient
maxRedirections int
client httpClient
maxRedirections int
acceptedStatusCodes statusCodeCollection
}

func newRedirectHttpClient(c httpClient, maxRedirections int) httpClient {
return &redirectHttpClient{c, maxRedirections}
func newRedirectHttpClient(c httpClient, maxRedirections int, acceptedStatusCodes statusCodeCollection) httpClient {
return &redirectHttpClient{c, maxRedirections, acceptedStatusCodes}
}

func (c *redirectHttpClient) Get(u *url.URL, header http.Header) (httpResponse, error) {
Expand All @@ -40,10 +41,11 @@ func (c *redirectHttpClient) Get(u *url.URL, header http.Header) (httpResponse,
return nil, c.formatError(err, i, u)
}

switch r.StatusCode() / 100 {
case 2:
code := r.StatusCode()

if c.acceptedStatusCodes.isInCollection(code) {
return r, nil
case 3:
} else if code >= 300 && code <= 399 {
i++

if i > c.maxRedirections {
Expand All @@ -63,8 +65,8 @@ func (c *redirectHttpClient) Get(u *url.URL, header http.Header) (httpResponse,
}

cj.SetCookies(u, parseCookies(r.Header("set-cookie")))
default:
return nil, c.formatError(fmt.Errorf("%v", r.StatusCode()), i, u)
} else {
return nil, c.formatError(fmt.Errorf("%v", code), i, u)
}
}
}
Expand Down
13 changes: 12 additions & 1 deletion redirect_http_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@ import (

const testUrl = "http://foo.com"

// acceptedStatusCodes is the collection handed to every redirect client built
// in these tests: a single range starting at 200 and ending at 300.
var acceptedStatusCodes = statusCodeCollection{
	elements: []statusCodeRange{
		{start: 200, end: 300},
	},
}

func TestNewRedirectHttpClient(t *testing.T) {
newRedirectHttpClient(newFakeHttpClient(nil), 42)
newRedirectHttpClient(newFakeHttpClient(nil), 42, acceptedStatusCodes)
}

func TestRedirectHttpClientGet(t *testing.T) {
Expand All @@ -30,6 +32,7 @@ func TestRedirectHttpClientGet(t *testing.T) {
},
),
42,
acceptedStatusCodes,
).Get(u, nil)

assert.Nil(t, err)
Expand Down Expand Up @@ -62,6 +65,7 @@ func TestRedirectHttpClientGetWithRedirect(t *testing.T) {
},
),
42,
acceptedStatusCodes,
).Get(u, nil)

assert.Nil(t, err)
Expand Down Expand Up @@ -96,6 +100,7 @@ func TestRedirectHttpClientGetWithRedirects(t *testing.T) {
},
),
maxRedirections,
acceptedStatusCodes,
).Get(u, nil)

assert.Nil(t, err)
Expand Down Expand Up @@ -134,6 +139,7 @@ func TestRedirectHttpClientGetWithRelativeRedirect(t *testing.T) {
},
),
maxRedirections,
acceptedStatusCodes,
).Get(u, nil)

assert.Nil(t, err)
Expand Down Expand Up @@ -163,6 +169,7 @@ func TestRedirectHttpClientFailWithTooManyRedirects(t *testing.T) {
},
),
maxRedirections,
acceptedStatusCodes,
).Get(u, nil)

assert.Nil(t, r)
Expand All @@ -182,6 +189,7 @@ func TestRedirectHttpClientFailWithUnsetLocationHeader(t *testing.T) {
},
),
42,
acceptedStatusCodes,
).Get(u, nil)

assert.Nil(t, r)
Expand All @@ -205,6 +213,7 @@ func TestRedirectHttpClientFailWithInvalidLocationURL(t *testing.T) {
},
),
42,
acceptedStatusCodes,
).Get(u, nil)

assert.Nil(t, r)
Expand All @@ -223,6 +232,7 @@ func TestRedirectHttpClientFailWithInvalidStatusCode(t *testing.T) {
},
),
42,
acceptedStatusCodes,
).Get(u, nil)

assert.Nil(t, r)
Expand Down Expand Up @@ -253,6 +263,7 @@ func TestRedirectHttpClientFailAfterRedirect(t *testing.T) {
},
),
42,
acceptedStatusCodes,
).Get(u, nil)

assert.Nil(t, r)
Expand Down
41 changes: 41 additions & 0 deletions status_code_collection.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package main

import "strings"

// statusCodeCollection is a set of HTTP response status codes expressed as a
// list of ranges. A code belongs to the collection when at least one of its
// ranges contains it.
type statusCodeCollection struct {
	elements []statusCodeRange
}

// parseStatusCodeCollection parses a comma-separated list of items, each of
// which is handed to parseStatusCodeRange, into a statusCodeCollection.
//
// Empty items are skipped, so an empty string, a trailing comma, or doubled
// commas are tolerated. When no item remains, the collection falls back to
// the single range statusCodeRange{200, 300}.
//
// As soon as one item fails to parse, it returns the zero statusCodeCollection
// together with the error reported by parseStatusCodeRange.
func parseStatusCodeCollection(value string) (statusCodeCollection, error) {
	ranges := []statusCodeRange{}

	for _, partial := range strings.Split(value, ",") {
		// Inspect the individual item, not the whole input: testing the whole
		// input only ever skipped the "" case and rejected "200,,403".
		if len(partial) == 0 {
			continue
		}

		r, err := parseStatusCodeRange(partial)
		if err != nil {
			return statusCodeCollection{}, err
		}

		ranges = append(ranges, *r)
	}

	if len(ranges) == 0 {
		ranges = append(ranges, statusCodeRange{200, 300})
	}

	return statusCodeCollection{ranges}, nil
}

// isInCollection reports whether code is contained in at least one range of
// the collection.
func (c *statusCodeCollection) isInCollection(code int) bool {
	for _, element := range c.elements {
		if element.isInRange(code) {
			return true
		}
	}

	return false
}
47 changes: 47 additions & 0 deletions status_code_collection_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package main

import (
"github.com/stretchr/testify/assert"
"testing"
)

// TestParsingEmptyStatusCodeCollection parses an empty string and probes the
// resulting collection at the boundaries around 200 and 300.
func TestParsingEmptyStatusCodeCollection(t *testing.T) {
	c, err := parseStatusCodeCollection("")

	assert.Nil(t, err)

	for _, tc := range []struct {
		code     int
		accepted bool
	}{
		{199, false},
		{200, true},
		{201, true},
		{298, true},
		{299, true},
		{300, false},
	} {
		assert.Equal(t, tc.accepted, c.isInCollection(tc.code))
	}
}

// TestParsingValidStatusCodeCollection parses a list made of one range and one
// single code, then probes the boundaries of both.
func TestParsingValidStatusCodeCollection(t *testing.T) {
	c, err := parseStatusCodeCollection("200..207,403")

	assert.Nil(t, err)

	for _, tc := range []struct {
		code     int
		accepted bool
	}{
		// Boundaries of the 200..207 range.
		{199, false},
		{200, true},
		{201, true},
		{205, true},
		{206, true},
		{207, false},
		// Neighbours of the single code 403.
		{402, false},
		{403, true},
		{404, false},
	} {
		assert.Equal(t, tc.accepted, c.isInCollection(tc.code))
	}
}

// TestParsingInvalidStatusCodeCollection checks that a list containing an
// unparsable item is rejected, and that the collection returned alongside the
// error is the zero value, which accepts nothing.
func TestParsingInvalidStatusCodeCollection(t *testing.T) {
	collection, err := parseStatusCodeCollection("200,foo")

	assert.Error(t, err)

	// A struct value and a bool can never be nil, so NotNil checks on them
	// always pass. Assert the observable state of the zero collection instead.
	assert.Empty(t, collection.elements)
	assert.False(t, collection.isInCollection(200))
}
36 changes: 36 additions & 0 deletions status_code_range.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package main

import (
"errors"
"regexp"
"strconv"
)

// Patterns recognised by parseStatusCodeRange. Both are fully anchored and
// tolerate whitespace around the digits.
var (
	// fixedCodePattern matches a single three-digit status code, e.g. "403".
	fixedCodePattern = regexp.MustCompile(`^\s*(\d{3})\s*$`)
	// rangeCodePattern matches two three-digit codes joined by "..",
	// e.g. "200..300".
	rangeCodePattern = regexp.MustCompile(`^\s*(\d{3})\s*\.\.\s*(\d{3})\s*$`)
)

// statusCodeRange is a half-open interval of HTTP status codes: start is
// included, end is excluded.
type statusCodeRange struct {
	start int
	end   int
}

// parseStatusCodeRange parses either a single three-digit status code ("403")
// or a range written as "<start>..<end>" ("200..300").
//
// A single code N yields the range [N, N+1). A range must satisfy
// start < end; a reversed or empty range such as "300..200" or "200..200"
// would match no code at all and is therefore rejected.
//
// It returns nil and an error when value matches neither form or describes an
// empty range.
func parseStatusCodeRange(value string) (*statusCodeRange, error) {
	if m := fixedCodePattern.FindStringSubmatch(value); m != nil {
		code, err := strconv.Atoi(m[1])
		if err != nil {
			return nil, err
		}

		return &statusCodeRange{code, code + 1}, nil
	}

	if m := rangeCodePattern.FindStringSubmatch(value); m != nil {
		start, err := strconv.Atoi(m[1])
		if err != nil {
			return nil, err
		}

		end, err := strconv.Atoi(m[2])
		if err != nil {
			return nil, err
		}

		if start >= end {
			return nil, errors.New("invalid HTTP response status code range: start must be less than end")
		}

		return &statusCodeRange{start, end}, nil
	}

	return nil, errors.New("invalid HTTP response status code value")
}

// isInRange reports whether code lies in [start, end).
func (r *statusCodeRange) isInRange(code int) bool {
	return code >= r.start && code < r.end
}
41 changes: 41 additions & 0 deletions status_code_range_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package main

import (
"github.com/stretchr/testify/assert"
"testing"
)

// TestParsingFixedStatusCode checks that a single code is parsed into a range
// that starts at the code and ends at the next one.
func TestParsingFixedStatusCode(t *testing.T) {
	got, err := parseStatusCodeRange("403")

	assert.Nil(t, err)

	start, end := got.start, got.end
	assert.Equal(t, 403, start)
	assert.Equal(t, 404, end)
}

// TestParsingStatusCodeRange checks that "<start>..<end>" is parsed into a
// range carrying both bounds unchanged.
func TestParsingStatusCodeRange(t *testing.T) {
	got, err := parseStatusCodeRange("200..300")

	assert.Nil(t, err)

	start, end := got.start, got.end
	assert.Equal(t, 200, start)
	assert.Equal(t, 300, end)
}

// TestParsingInvalidStatusCode checks that a non-numeric value yields an error
// and no range.
func TestParsingInvalidStatusCode(t *testing.T) {
	got, err := parseStatusCodeRange("foo")

	assert.Nil(t, got)
	assert.NotNil(t, err)
}

// TestInRangeOfStatusCode probes isInRange at both ends of a 200..300 range:
// the start is included and the end is excluded.
func TestInRangeOfStatusCode(t *testing.T) {
	r := statusCodeRange{start: 200, end: 300}

	for _, tc := range []struct {
		code    int
		inRange bool
	}{
		{199, false},
		{200, true},
		{201, true},
		{298, true},
		{299, true},
		{300, false},
	} {
		assert.Equal(t, tc.inRange, r.isInRange(tc.code))
	}
}
Loading