From a9fe0d8e580b2c1167960e73befdeb48a316c1b8 Mon Sep 17 00:00:00 2001 From: Ralph Slooten Date: Sat, 14 Oct 2023 22:28:14 +1300 Subject: [PATCH 1/2] Chore: Replace html2text modules with simplified internal function The module microcosm-cc/bluemonday now requires Go v1.21 and is, quite frankly, overkill, as Mailpit only needs to convert HTML to a single line (no formatting). --- go.mod | 3 -- go.sum | 16 ------ internal/storage/utils.go | 10 ++-- internal/tools/html.go | 11 ---- internal/tools/html2text/html2text.go | 72 +++++++++++++++++++++++++++ internal/tools/snippets.go | 8 ++- 6 files changed, 78 insertions(+), 42 deletions(-) create mode 100644 internal/tools/html2text/html2text.go diff --git a/go.mod b/go.mod index 7276b1a47..de3d85d1d 100644 --- a/go.mod +++ b/go.mod @@ -11,11 +11,9 @@ require ( github.com/gorilla/mux v1.8.0 github.com/gorilla/websocket v1.5.0 github.com/jhillyerd/enmime v1.0.1 - github.com/k3a/html2text v1.2.1 github.com/klauspost/compress v1.17.0 github.com/leporo/sqlf v1.4.0 github.com/mhale/smtpd v0.8.0 - github.com/microcosm-cc/bluemonday v1.0.25 github.com/reiver/go-telnet v0.0.0-20180421082511-9ff0b2ab096e github.com/satori/go.uuid v1.2.0 github.com/sirupsen/logrus v1.9.3 @@ -33,7 +31,6 @@ require ( github.com/DATA-DOG/go-sqlmock v1.5.0 // indirect github.com/GehirnInc/crypt v0.0.0-20230320061759-8cc1b52080c5 // indirect github.com/andybalholm/cascadia v1.3.2 // indirect - github.com/aymerick/douceur v0.2.0 // indirect github.com/cention-sany/utf7 v0.0.0-20170124080048-26cad61bd60a // indirect github.com/cznic/ql v1.2.0 // indirect github.com/dustin/go-humanize v1.0.1 // indirect diff --git a/go.sum b/go.sum index 87fc6de3e..c5a4f444f 100644 --- a/go.sum +++ b/go.sum @@ -13,8 +13,6 @@ github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsVi github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/axllent/semver v0.0.1 
h1:QqF+KSGxgj8QZzSXAvKFqjGWE5792ksOnQhludToK8E= github.com/axllent/semver v0.0.1/go.mod h1:2xSPzvG8n9mRfdtxSvWvfTfQGWfHsMsHO1iZnKATMSc= -github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk= -github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4= github.com/cention-sany/utf7 v0.0.0-20170124080048-26cad61bd60a h1:MISbI8sU/PSK/ztvmWKFcI7UGb5/HQT7B+i3a2myKgI= github.com/cention-sany/utf7 v0.0.0-20170124080048-26cad61bd60a/go.mod h1:2GxOXOlEPAMFPfp014mK1SWq8G8BN8o7/dfYqJrVGn8= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= @@ -59,8 +57,6 @@ github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbuBVKCudVG457BR2GZFIz3uw3hQ= github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4= github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= -github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/gorilla/css v1.0.0 h1:BQqNyPTi50JCFMTw/b67hByjMVXZRwGha6wxVGkeihY= github.com/gorilla/css v1.0.0/go.mod h1:Dn721qIggHpt4+EFCcTLTU/vk5ySda2ReITrtgBl60c= github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= @@ -73,10 +69,6 @@ github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 h1:iCHtR9CQykt github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk= github.com/jhillyerd/enmime v1.0.1 h1:y6RyqIgBOI2hIinOXIzmeB+ITRVls0zTJIm5GwgXnjE= github.com/jhillyerd/enmime v1.0.1/go.mod h1:LMMbm6oTlzWHghPavqHtOrP/NosVv3l42CUrZjn03/Q= -github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= -github.com/jtolds/gls 
v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= -github.com/k3a/html2text v1.2.1 h1:nvnKgBvBR/myqrwfLuiqecUtaK1lB9hGziIJKatNFVY= -github.com/k3a/html2text v1.2.1/go.mod h1:ieEXykM67iT8lTvEWBh6fhpH4B23kB9OMKPdIBmgUqA= github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs= github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8= github.com/klauspost/compress v1.17.0 h1:Rnbp4K9EjcDuVuHtd0dgA4qNuv9yKDYKK1ulpJwgrqM= @@ -99,8 +91,6 @@ github.com/mattn/go-sqlite3 v1.14.16 h1:yOQRA0RpS5PFz/oikGwBEqvAWhWg5ufRz4ETLjwp github.com/mattn/go-sqlite3 v1.14.16/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg= github.com/mhale/smtpd v0.8.0 h1:5JvdsehCg33PQrZBvFyDMMUDQmvbzVpZgKob7eYBJc0= github.com/mhale/smtpd v0.8.0/go.mod h1:MQl+y2hwIEQCXtNhe5+55n0GZOjSmeqORDIXbqUL3x4= -github.com/microcosm-cc/bluemonday v1.0.25 h1:4NEwSfiJ+Wva0VxN5B8OwMicaJvD8r9tlJWm9rtloEg= -github.com/microcosm-cc/bluemonday v1.0.25/go.mod h1:ZIOjCQp1OrzBBPIJmfX4qDYFuhU02nx4bn030ixfHLE= github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -123,10 +113,6 @@ github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww= github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM= -github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= 
-github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= -github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I= github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= @@ -166,7 +152,6 @@ golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY= golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -210,7 +195,6 @@ golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= diff --git 
a/internal/storage/utils.go b/internal/storage/utils.go index 58f91b0ab..877b993fa 100644 --- a/internal/storage/utils.go +++ b/internal/storage/utils.go @@ -11,9 +11,9 @@ import ( "github.com/axllent/mailpit/config" "github.com/axllent/mailpit/internal/logger" + "github.com/axllent/mailpit/internal/tools/html2text" "github.com/axllent/mailpit/server/websockets" "github.com/jhillyerd/enmime" - "github.com/k3a/html2text" "github.com/leporo/sqlf" ) @@ -39,12 +39,8 @@ func createSearchText(env *enmime.Envelope) string { b.WriteString(env.GetHeader("Bcc") + " ") b.WriteString(env.GetHeader("Reply-To") + " ") b.WriteString(env.GetHeader("Return-Path") + " ") - h := strings.TrimSpace( - html2text.HTML2TextWithOptions( - env.HTML, - html2text.WithLinksInnerText(), - ), - ) + + h := html2text.Strip(env.HTML, true) if h != "" { b.WriteString(h + " ") } else { diff --git a/internal/tools/html.go b/internal/tools/html.go index 06de636b7..6e5e8832b 100644 --- a/internal/tools/html.go +++ b/internal/tools/html.go @@ -2,9 +2,7 @@ package tools import ( "fmt" - "strings" - "github.com/microcosm-cc/bluemonday" "golang.org/x/net/html" ) @@ -19,12 +17,3 @@ func GetHTMLAttributeVal(e *html.Node, key string) (string, error) { return "", fmt.Errorf("%s not found", key) } - -// StripHTML returns text from an HTML string -func stripHTML(h string) string { - p := bluemonday.StrictPolicy() - // // ensure joining html elements are spaced apart, eg table cells etc - h = strings.ReplaceAll(h, "><", "> <") - // return p.Sanitize(h) - return html.UnescapeString(p.Sanitize(h)) -} diff --git a/internal/tools/html2text/html2text.go b/internal/tools/html2text/html2text.go new file mode 100644 index 000000000..0940d067a --- /dev/null +++ b/internal/tools/html2text/html2text.go @@ -0,0 +1,72 @@ +// Package html2text is a simple library to convert HTML to plain text +package html2text + +import ( + "bytes" + "log" + "regexp" + "strings" + + "golang.org/x/net/html" +) + +var ( + re = 
regexp.MustCompile(`\s+`) + spaceRe = regexp.MustCompile(`(?mi)<\/(div|p|td|th|h[1-6]|ul|ol|li|address|article|aside|blockquote|dl|dt|footer|header|hr|main|nav|pre|table|thead|tfoot|video)><`) + brRe = regexp.MustCompile(`(?mi)<(br /|br)>`) + imgRe = regexp.MustCompile(`(?mi)<(img)`) + skip = make(map[string]bool) +) + +func init() { + skip["script"] = true + skip["title"] = true + skip["head"] = true + skip["link"] = true + skip["meta"] = true + skip["style"] = true + skip["noscript"] = true +} + +// Strip will convert a HTML string to plain text +func Strip(h string, includeLinks bool) string { + h = spaceRe.ReplaceAllString(h, " <") + h = brRe.ReplaceAllString(h, " ") + h = imgRe.ReplaceAllString(h, " <$1") + var buffer bytes.Buffer + doc, err := html.Parse(strings.NewReader(h)) + if err != nil { + log.Fatal(err) + } + + extract(doc, &buffer, includeLinks) + return clean(buffer.String()) +} + +func extract(node *html.Node, buff *bytes.Buffer, includeLinks bool) { + if node.Type == html.TextNode { + data := node.Data + if data != "" { + buff.WriteString(data) + } + } + for c := node.FirstChild; c != nil; c = c.NextSibling { + if _, skip := skip[c.Data]; !skip { + if includeLinks && c.Data == "a" { + for _, a := range c.Attr { + if a.Key == "href" && strings.HasPrefix(strings.ToLower(a.Val), "http") { + buff.WriteString(" " + a.Val + " ") + } + } + } + extract(c, buff, includeLinks) + } + } +} + +func clean(text string) string { + // replace \uFEFF with space, see https://github.com/golang/go/issues/42274#issuecomment-1017258184 + text = strings.ReplaceAll(text, string('\uFEFF'), " ") + text = re.ReplaceAllString(text, " ") + return strings.TrimSpace(text) +} diff --git a/internal/tools/snippets.go b/internal/tools/snippets.go index 5b8510f28..1322caf17 100644 --- a/internal/tools/snippets.go +++ b/internal/tools/snippets.go @@ -3,6 +3,8 @@ package tools import ( "regexp" "strings" + + "github.com/axllent/mailpit/internal/tools/html2text" ) // CreateSnippet 
returns a message snippet. It will use the HTML version (if it exists) @@ -12,17 +14,13 @@ func CreateSnippet(text, html string) string { html = strings.TrimSpace(html) limit := 200 spaceRe := regexp.MustCompile(`\s+`) - nlRe := regexp.MustCompile(`\r?\n`) if text == "" && html == "" { return "" } if html != "" { - data := nlRe.ReplaceAllString(stripHTML(html), " ") - // replace \uFEFF with space, see https://github.com/golang/go/issues/42274#issuecomment-1017258184 - data = strings.ReplaceAll(data, string('\uFEFF'), " ") - data = strings.TrimSpace(spaceRe.ReplaceAllString(data, " ")) + data := html2text.Strip(html, false) if len(data) <= limit { return data From e9d44c55a1fe87ec45071065c7b9b1625d828d2c Mon Sep 17 00:00:00 2001 From: Ralph Slooten Date: Sat, 14 Oct 2023 22:28:52 +1300 Subject: [PATCH 2/2] Tests: Add html2text tests --- .github/workflows/tests.yml | 2 +- internal/tools/html2text/html2text_test.go | 56 ++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 internal/tools/html2text/html2text_test.go diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e26ce529f..c26e25830 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -24,7 +24,7 @@ jobs: key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} restore-keys: | ${{ runner.os }}-go- - - run: go test ./internal/storage ./server ./internal/tools -v + - run: go test ./internal/storage ./server ./internal/tools ./internal/tools/html2text -v - run: go test ./internal/storage -bench=. 
# build the assets diff --git a/internal/tools/html2text/html2text_test.go b/internal/tools/html2text/html2text_test.go new file mode 100644 index 000000000..dd1763918 --- /dev/null +++ b/internal/tools/html2text/html2text_test.go @@ -0,0 +1,56 @@ +package html2text + +import "testing" + +func TestPlain(t *testing.T) { + tests := map[string]string{} + tests["this is a test"] = "this is a test" + tests["thiS IS a Test"] = "thiS IS a Test" + tests["thiS IS a Test :-)"] = "thiS IS a Test :-)" + tests["

This is a test.

"] = "This is a test." + tests["

Paragraph 1

Paragraph 2

"] = "Paragraph 1 Paragraph 2" + tests["

Heading

Paragraph

"] = "Heading Paragraph" + tests["Alphabet chars"] = "Alphabet chars" + tests["Alphabet chars."] = "Alphabet chars." + tests["
FirstSecond
"] = "First Second" + tests[`

Heading

+

Paragraph

`] = "Heading Paragraph" + tests[`

Heading

linked text

`] = "Heading linked text" + // broken html + tests[`

Heading

linked text.`] = "Heading linked text." + + for str, expected := range tests { + res := Strip(str, false) + if res != expected { + t.Log("error:", res, "!=", expected) + t.Fail() + } + } +} + +func TestWithLinks(t *testing.T) { + tests := map[string]string{} + tests["this is a test"] = "this is a test" + tests["thiS IS a Test"] = "thiS IS a Test" + tests["thiS IS a Test :-)"] = "thiS IS a Test :-)" + tests["

This is a test.

"] = "This is a test." + tests["

Paragraph 1

Paragraph 2

"] = "Paragraph 1 Paragraph 2" + tests["

Heading

Paragraph

"] = "Heading Paragraph" + tests["Alphabet chars"] = "Alphabet chars" + tests["Alphabet chars."] = "Alphabet chars." + tests["
FirstSecond
"] = "First Second" + tests["

Heading

Paragraph

"] = "Heading Paragraph" + tests[`

Heading

+

Paragraph

`] = "Heading Paragraph" + tests[`

Heading

linked text

`] = "Heading https://github.com linked text" + // broken html + tests[`

Heading

linked text.`] = "Heading https://github.com linked text." + + for str, expected := range tests { + res := Strip(str, true) + if res != expected { + t.Log("error:", res, "!=", expected) + t.Fail() + } + } +}