From 1b000c4d4ee24b4acf6c077de6c1ce2385a5dd6e Mon Sep 17 00:00:00 2001 From: WGH Date: Thu, 5 Jan 2023 21:52:12 +0300 Subject: [PATCH 1/2] Add tests for automated .gz decompression This is not about Content-Encoding: gzip, which is undone automatically by Go net/http. This is about files that are compressed as is, e.g. /sitemap.xml.gz, without Content-Encoding: gzip being sent. --- colly_test.go | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/colly_test.go b/colly_test.go index 4358b63e..5d8de590 100644 --- a/colly_test.go +++ b/colly_test.go @@ -17,6 +17,7 @@ package colly import ( "bufio" "bytes" + "compress/gzip" "context" "errors" "fmt" @@ -43,6 +44,13 @@ Disallow: /disallowed Disallow: /allowed*q= ` +const testXML = ` + + Test Page + This is a test page + This is a test paragraph +` + func newUnstartedTestServer() *httptest.Server { mux := http.NewServeMux() @@ -69,13 +77,13 @@ func newUnstartedTestServer() *httptest.Server { mux.HandleFunc("/xml", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/xml") - w.Write([]byte(` - - Test Page - This is a test page - This is a test paragraph - - `)) + w.Write([]byte(testXML)) + }) + + mux.HandleFunc("/test.xml.gz", func(w http.ResponseWriter, r *http.Request) { + ww := gzip.NewWriter(w) + defer ww.Close() + ww.Write([]byte(testXML)) }) mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) { @@ -1417,7 +1425,7 @@ func TestCollectorOnXMLWithHtml(t *testing.T) { } } -func TestCollectorOnXMLWithXML(t *testing.T) { +func testCollectorOnXMLWithXML(t *testing.T, path string) { ts := newTestServer() defer ts.Close() @@ -1450,7 +1458,7 @@ func TestCollectorOnXMLWithXML(t *testing.T) { } }) - c.Visit(ts.URL + "/xml") + c.Visit(ts.URL + path) if !titleCallbackCalled { t.Error("Failed to call OnXML callback for tag") @@ -1461,6 +1469,14 @@ func TestCollectorOnXMLWithXML(t *testing.T) { } } +func TestCollectorOnXMLWithXML(t *testing.T) { + testCollectorOnXMLWithXML(t, "/xml") +} + +func TestCollectorOnXMLWithXMLCompressed(t *testing.T) { + testCollectorOnXMLWithXML(t, "/test.xml.gz") +} + func TestCollectorVisitWithTrace(t *testing.T) { ts := newTestServer() defer ts.Close() From d5f1ff80fe87f91c76f0efd0960b2ba7d19cbccd Mon Sep 17 00:00:00 2001 From: WGH <wgh@torlan.ru> Date: Wed, 28 Dec 2022 17:09:26 +0300 Subject: [PATCH 2/2] Don't decompress gzip if data doesn't look like gzip Prevents incorrect response being returned in cases like /sitemap.xml.gz is requested, but uncompressed 404 page is served instead. Thanks-to: Seth Davis <seth@xyplanningnetwork.com> --- colly_test.go | 68 +++++++++++++++++++++++++++++++++++++++++++++++++ http_backend.go | 23 ++++++++++++++--- 2 files changed, 88 insertions(+), 3 deletions(-) diff --git a/colly_test.go b/colly_test.go index 5d8de590..ee54e0da 100644 --- a/colly_test.go +++ b/colly_test.go @@ -51,6 +51,8 @@ const testXML = `<?xml version="1.0" encoding="UTF-8"?> <paragraph type="description">This is a test paragraph</paragraph> </page>` +const custom404 = `404 not found` + func newUnstartedTestServer() *httptest.Server { mux := http.NewServeMux() @@ -86,6 +88,14 @@ func newUnstartedTestServer() *httptest.Server { ww.Write([]byte(testXML)) }) + mux.HandleFunc("/nonexistent.xml.gz", func(w http.ResponseWriter, r *http.Request) { + http.Error(w, custom404, http.StatusNotFound) + }) + + mux.HandleFunc("/empty-response.xml.gz", func(w http.ResponseWriter, r *http.Request) { + // write nothing + }) + mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) { if r.Method == "POST" { w.Header().Set("Content-Type", "text/html") @@ -1477,6 +1487,64 @@ func TestCollectorOnXMLWithXMLCompressed(t *testing.T) { testCollectorOnXMLWithXML(t, "/test.xml.gz") } +func TestCollectorNonexistentXMLGZ(t *testing.T) { + // This is a regression test for colly + // attempting to decompress all .xml.gz URLs + // even if they're not compressed. + ts := newTestServer() + defer ts.Close() + + c := NewCollector(ParseHTTPErrorResponse()) + + onResponseCalled := false + + c.OnResponse(func(resp *Response) { + onResponseCalled = true + if got, want := strings.TrimSpace(string(resp.Body)), custom404; got != want { + t.Errorf("wrong response body got=%q want=%q", got, want) + } + }) + + c.OnError(func(resp *Response, err error) { + t.Errorf("called on OnError: err=%v", err) + }) + + c.Visit(ts.URL + "/nonexistent.xml.gz") + + if !onResponseCalled { + t.Error("OnResponse was not called") + } +} + +func TestCollectorEmptyXMLGZ(t *testing.T) { + // This is a regression test for colly + // attempting to decompress all .xml.gz URLs + // even if they're not compressed. + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + + onResponseCalled := false + + c.OnResponse(func(resp *Response) { + onResponseCalled = true + if got, want := strings.TrimSpace(string(resp.Body)), ""; got != want { + t.Errorf("wrong response body got=%q want=%q", got, want) + } + }) + + c.OnError(func(resp *Response, err error) { + t.Errorf("called on OnError: err=%v", err) + }) + + c.Visit(ts.URL + "/empty-response.xml.gz") + + if !onResponseCalled { + t.Error("OnResponse was not called") + } +} + func TestCollectorVisitWithTrace(t *testing.T) { ts := newTestServer() defer ts.Close() diff --git a/http_backend.go b/http_backend.go index e580f7a2..01c6832c 100644 --- a/http_backend.go +++ b/http_backend.go @@ -15,6 +15,7 @@ package colly import ( + "bufio" "crypto/sha1" "encoding/gob" "encoding/hex" @@ -202,11 +203,27 @@ func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc c } contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding")) if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(finalRequest.URL.Path), ".xml.gz")) { - bodyReader, err = gzip.NewReader(bodyReader) - if err != nil { + // Even if URL contains .xml.gz, it doesn't mean that we get gzip + // compressed data back. We might get 404 error page instead, + // for example. So check gzip magic bytes. + bufReader := bufio.NewReader(bodyReader) + bodyReader = bufReader + magic, err := bufReader.Peek(2) + switch err { + case io.EOF: + // less than 2 bytes, do nothing + case nil: + // gzip magic, as specified in RFC 1952 + if magic[0] == 0x1f && magic[1] == 0x8b { + bodyReader, err = gzip.NewReader(bufReader) + if err != nil { + return nil, err + } + defer bodyReader.(*gzip.Reader).Close() + } + default: return nil, err } - defer bodyReader.(*gzip.Reader).Close() } body, err := io.ReadAll(bodyReader) if err != nil {