diff --git a/colly_test.go b/colly_test.go
index 4358b63e..ee54e0da 100644
--- a/colly_test.go
+++ b/colly_test.go
@@ -17,6 +17,7 @@ package colly
import (
"bufio"
"bytes"
+ "compress/gzip"
"context"
"errors"
"fmt"
@@ -43,6 +44,15 @@ Disallow: /disallowed
Disallow: /allowed*q=
`
+const testXML = `
+
+ Test Page
+ This is a test page
+ This is a test paragraph
+`
+
+const custom404 = `404 not found`
+
func newUnstartedTestServer() *httptest.Server {
mux := http.NewServeMux()
@@ -69,13 +79,21 @@ func newUnstartedTestServer() *httptest.Server {
mux.HandleFunc("/xml", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/xml")
- w.Write([]byte(`
-
- Test Page
- This is a test page
- This is a test paragraph
-
- `))
+ w.Write([]byte(testXML))
+ })
+
+ mux.HandleFunc("/test.xml.gz", func(w http.ResponseWriter, r *http.Request) {
+ ww := gzip.NewWriter(w)
+ defer ww.Close()
+ ww.Write([]byte(testXML))
+ })
+
+ mux.HandleFunc("/nonexistent.xml.gz", func(w http.ResponseWriter, r *http.Request) {
+ http.Error(w, custom404, http.StatusNotFound)
+ })
+
+ mux.HandleFunc("/empty-response.xml.gz", func(w http.ResponseWriter, r *http.Request) {
+ // write nothing
})
mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) {
@@ -1417,7 +1435,7 @@ func TestCollectorOnXMLWithHtml(t *testing.T) {
}
}
-func TestCollectorOnXMLWithXML(t *testing.T) {
+func testCollectorOnXMLWithXML(t *testing.T, path string) {
ts := newTestServer()
defer ts.Close()
@@ -1450,7 +1468,7 @@ func TestCollectorOnXMLWithXML(t *testing.T) {
}
})
- c.Visit(ts.URL + "/xml")
+ c.Visit(ts.URL + path)
if !titleCallbackCalled {
t.Error("Failed to call OnXML callback for
tag")
@@ -1461,6 +1479,72 @@ func TestCollectorOnXMLWithXML(t *testing.T) {
}
}
+// TestCollectorOnXMLWithXML runs the shared OnXML checks in
+// testCollectorOnXMLWithXML against the test server's "/xml" path.
+func TestCollectorOnXMLWithXML(t *testing.T) {
+ testCollectorOnXMLWithXML(t, "/xml")
+}
+
+// TestCollectorOnXMLWithXMLCompressed runs the shared OnXML checks in
+// testCollectorOnXMLWithXML against "/test.xml.gz", which the test server
+// answers with testXML written through a gzip writer.
+func TestCollectorOnXMLWithXMLCompressed(t *testing.T) {
+ testCollectorOnXMLWithXML(t, "/test.xml.gz")
+}
+
+// TestCollectorNonexistentXMLGZ is a regression test: a URL ending in
+// .xml.gz may be answered with a plain, uncompressed body (here a 404
+// page), and that body must reach OnResponse untouched rather than
+// surfacing as an error.
+func TestCollectorNonexistentXMLGZ(t *testing.T) {
+	server := newTestServer()
+	defer server.Close()
+
+	// ParseHTTPErrorResponse is enabled so the 404 response is handed to
+	// the OnResponse callback.
+	collector := NewCollector(ParseHTTPErrorResponse())
+
+	var responded bool
+
+	collector.OnResponse(func(r *Response) {
+		responded = true
+		got := strings.TrimSpace(string(r.Body))
+		want := custom404
+		if got != want {
+			t.Errorf("wrong response body got=%q want=%q", got, want)
+		}
+	})
+
+	// Any error callback means the plain-text body was mishandled.
+	collector.OnError(func(_ *Response, err error) {
+		t.Errorf("called on OnError: err=%v", err)
+	})
+
+	target := server.URL + "/nonexistent.xml.gz"
+	collector.Visit(target)
+
+	if !responded {
+		t.Error("OnResponse was not called")
+	}
+}
+
+// TestCollectorEmptyXMLGZ is a regression test: a URL ending in .xml.gz
+// whose response carries no body at all must still be delivered to
+// OnResponse (with an empty body) instead of being reported via OnError.
+func TestCollectorEmptyXMLGZ(t *testing.T) {
+	server := newTestServer()
+	defer server.Close()
+
+	collector := NewCollector()
+
+	var responded bool
+
+	collector.OnResponse(func(r *Response) {
+		responded = true
+		const want = ""
+		got := strings.TrimSpace(string(r.Body))
+		if got != want {
+			t.Errorf("wrong response body got=%q want=%q", got, want)
+		}
+	})
+
+	// An empty body is not an error; reaching this callback fails the test.
+	collector.OnError(func(_ *Response, err error) {
+		t.Errorf("called on OnError: err=%v", err)
+	})
+
+	target := server.URL + "/empty-response.xml.gz"
+	collector.Visit(target)
+
+	if !responded {
+		t.Error("OnResponse was not called")
+	}
+}
+
func TestCollectorVisitWithTrace(t *testing.T) {
ts := newTestServer()
defer ts.Close()
diff --git a/http_backend.go b/http_backend.go
index e580f7a2..01c6832c 100644
--- a/http_backend.go
+++ b/http_backend.go
@@ -15,6 +15,7 @@
package colly
import (
+ "bufio"
"crypto/sha1"
"encoding/gob"
"encoding/hex"
@@ -202,11 +203,27 @@ func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc c
}
contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding"))
if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(finalRequest.URL.Path), ".xml.gz")) {
- bodyReader, err = gzip.NewReader(bodyReader)
- if err != nil {
+ // Even if URL contains .xml.gz, it doesn't mean that we get gzip
+ // compressed data back. We might get 404 error page instead,
+ // for example. So check gzip magic bytes.
+ bufReader := bufio.NewReader(bodyReader)
+ bodyReader = bufReader
+ magic, err := bufReader.Peek(2)
+ switch err {
+ case io.EOF:
+ // less than 2 bytes, do nothing
+ case nil:
+ // gzip magic, as specified in RFC 1952
+ if magic[0] == 0x1f && magic[1] == 0x8b {
+ bodyReader, err = gzip.NewReader(bufReader)
+ if err != nil {
+ return nil, err
+ }
+ defer bodyReader.(*gzip.Reader).Close()
+ }
+ default:
return nil, err
}
- defer bodyReader.(*gzip.Reader).Close()
}
body, err := io.ReadAll(bodyReader)
if err != nil {