diff --git a/colly_test.go b/colly_test.go index 4358b63e..ee54e0da 100644 --- a/colly_test.go +++ b/colly_test.go @@ -17,6 +17,7 @@ package colly import ( "bufio" "bytes" + "compress/gzip" "context" "errors" "fmt" @@ -43,6 +44,15 @@ Disallow: /disallowed Disallow: /allowed*q= ` +const testXML = ` + + Test Page + This is a test page + This is a test paragraph +` + +const custom404 = `404 not found` + func newUnstartedTestServer() *httptest.Server { mux := http.NewServeMux() @@ -69,13 +79,21 @@ func newUnstartedTestServer() *httptest.Server { mux.HandleFunc("/xml", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/xml") - w.Write([]byte(` - - Test Page - This is a test page - This is a test paragraph - - `)) + w.Write([]byte(testXML)) + }) + + mux.HandleFunc("/test.xml.gz", func(w http.ResponseWriter, r *http.Request) { + ww := gzip.NewWriter(w) + defer ww.Close() + ww.Write([]byte(testXML)) + }) + + mux.HandleFunc("/nonexistent.xml.gz", func(w http.ResponseWriter, r *http.Request) { + http.Error(w, custom404, http.StatusNotFound) + }) + + mux.HandleFunc("/empty-response.xml.gz", func(w http.ResponseWriter, r *http.Request) { + // write nothing }) mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) { @@ -1417,7 +1435,7 @@ func TestCollectorOnXMLWithHtml(t *testing.T) { } } -func TestCollectorOnXMLWithXML(t *testing.T) { +func testCollectorOnXMLWithXML(t *testing.T, path string) { ts := newTestServer() defer ts.Close() @@ -1450,7 +1468,7 @@ func TestCollectorOnXMLWithXML(t *testing.T) { } }) - c.Visit(ts.URL + "/xml") + c.Visit(ts.URL + path) if !titleCallbackCalled { t.Error("Failed to call OnXML callback for tag") @@ -1461,6 +1479,72 @@ func TestCollectorOnXMLWithXML(t *testing.T) { } } +func TestCollectorOnXMLWithXML(t *testing.T) { + testCollectorOnXMLWithXML(t, "/xml") +} + +func TestCollectorOnXMLWithXMLCompressed(t *testing.T) { + testCollectorOnXMLWithXML(t, "/test.xml.gz") +} + +func TestCollectorNonexistentXMLGZ(t *testing.T) { + // This is a regression test for colly + // attempting to decompress all .xml.gz URLs + // even if they're not compressed. + ts := newTestServer() + defer ts.Close() + + c := NewCollector(ParseHTTPErrorResponse()) + + onResponseCalled := false + + c.OnResponse(func(resp *Response) { + onResponseCalled = true + if got, want := strings.TrimSpace(string(resp.Body)), custom404; got != want { + t.Errorf("wrong response body got=%q want=%q", got, want) + } + }) + + c.OnError(func(resp *Response, err error) { + t.Errorf("called on OnError: err=%v", err) + }) + + c.Visit(ts.URL + "/nonexistent.xml.gz") + + if !onResponseCalled { + t.Error("OnResponse was not called") + } +} + +func TestCollectorEmptyXMLGZ(t *testing.T) { + // This is a regression test for colly + // attempting to decompress all .xml.gz URLs + // even if they're not compressed. + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + + onResponseCalled := false + + c.OnResponse(func(resp *Response) { + onResponseCalled = true + if got, want := strings.TrimSpace(string(resp.Body)), ""; got != want { + t.Errorf("wrong response body got=%q want=%q", got, want) + } + }) + + c.OnError(func(resp *Response, err error) { + t.Errorf("called on OnError: err=%v", err) + }) + + c.Visit(ts.URL + "/empty-response.xml.gz") + + if !onResponseCalled { + t.Error("OnResponse was not called") + } +} + func TestCollectorVisitWithTrace(t *testing.T) { ts := newTestServer() defer ts.Close() diff --git a/http_backend.go b/http_backend.go index e580f7a2..01c6832c 100644 --- a/http_backend.go +++ b/http_backend.go @@ -15,6 +15,7 @@ package colly import ( + "bufio" "crypto/sha1" "encoding/gob" "encoding/hex" @@ -202,11 +203,27 @@ func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc c } contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding")) if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(finalRequest.URL.Path), ".xml.gz")) { - bodyReader, err = gzip.NewReader(bodyReader) - if err != nil { + // Even if URL contains .xml.gz, it doesn't mean that we get gzip + // compressed data back. We might get 404 error page instead, + // for example. So check gzip magic bytes. + bufReader := bufio.NewReader(bodyReader) + bodyReader = bufReader + magic, err := bufReader.Peek(2) + switch err { + case io.EOF: + // less than 2 bytes, do nothing + case nil: + // gzip magic, as specified in RFC 1952 + if magic[0] == 0x1f && magic[1] == 0x8b { + bodyReader, err = gzip.NewReader(bufReader) + if err != nil { + return nil, err + } + defer bodyReader.(*gzip.Reader).Close() + } + default: return nil, err } - defer bodyReader.(*gzip.Reader).Close() } body, err := io.ReadAll(bodyReader) if err != nil {