Skip to content

Commit

Permalink
Merge branch 'feature/html2text' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
axllent committed Oct 14, 2023
2 parents 93da187 + e9d44c5 commit eeac32d
Show file tree
Hide file tree
Showing 8 changed files with 135 additions and 43 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }}
restore-keys: |
${{ runner.os }}-go-
- run: go test ./internal/storage ./server ./internal/tools -v
- run: go test ./internal/storage ./server ./internal/tools ./internal/tools/html2text -v
- run: go test ./internal/storage -bench=.

# build the assets
Expand Down
3 changes: 0 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,9 @@ require (
github.com/gorilla/mux v1.8.0
github.com/gorilla/websocket v1.5.0
github.com/jhillyerd/enmime v1.0.1
github.com/k3a/html2text v1.2.1
github.com/klauspost/compress v1.17.0
github.com/leporo/sqlf v1.4.0
github.com/mhale/smtpd v0.8.0
github.com/microcosm-cc/bluemonday v1.0.25
github.com/reiver/go-telnet v0.0.0-20180421082511-9ff0b2ab096e
github.com/satori/go.uuid v1.2.0
github.com/sirupsen/logrus v1.9.3
Expand All @@ -33,7 +31,6 @@ require (
github.com/DATA-DOG/go-sqlmock v1.5.0 // indirect
github.com/GehirnInc/crypt v0.0.0-20230320061759-8cc1b52080c5 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/aymerick/douceur v0.2.0 // indirect
github.com/cention-sany/utf7 v0.0.0-20170124080048-26cad61bd60a // indirect
github.com/cznic/ql v1.2.0 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
Expand Down
16 changes: 0 additions & 16 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@ github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsVi
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/axllent/semver v0.0.1 h1:QqF+KSGxgj8QZzSXAvKFqjGWE5792ksOnQhludToK8E=
github.com/axllent/semver v0.0.1/go.mod h1:2xSPzvG8n9mRfdtxSvWvfTfQGWfHsMsHO1iZnKATMSc=
github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk=
github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4=
github.com/cention-sany/utf7 v0.0.0-20170124080048-26cad61bd60a h1:MISbI8sU/PSK/ztvmWKFcI7UGb5/HQT7B+i3a2myKgI=
github.com/cention-sany/utf7 v0.0.0-20170124080048-26cad61bd60a/go.mod h1:2GxOXOlEPAMFPfp014mK1SWq8G8BN8o7/dfYqJrVGn8=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
Expand Down Expand Up @@ -59,8 +57,6 @@ github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbuBVKCudVG457BR2GZFIz3uw3hQ=
github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4=
github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8=
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
github.com/gorilla/css v1.0.0 h1:BQqNyPTi50JCFMTw/b67hByjMVXZRwGha6wxVGkeihY=
github.com/gorilla/css v1.0.0/go.mod h1:Dn721qIggHpt4+EFCcTLTU/vk5ySda2ReITrtgBl60c=
github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI=
Expand All @@ -73,10 +69,6 @@ github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 h1:iCHtR9CQykt
github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk=
github.com/jhillyerd/enmime v1.0.1 h1:y6RyqIgBOI2hIinOXIzmeB+ITRVls0zTJIm5GwgXnjE=
github.com/jhillyerd/enmime v1.0.1/go.mod h1:LMMbm6oTlzWHghPavqHtOrP/NosVv3l42CUrZjn03/Q=
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/k3a/html2text v1.2.1 h1:nvnKgBvBR/myqrwfLuiqecUtaK1lB9hGziIJKatNFVY=
github.com/k3a/html2text v1.2.1/go.mod h1:ieEXykM67iT8lTvEWBh6fhpH4B23kB9OMKPdIBmgUqA=
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs=
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8=
github.com/klauspost/compress v1.17.0 h1:Rnbp4K9EjcDuVuHtd0dgA4qNuv9yKDYKK1ulpJwgrqM=
Expand All @@ -99,8 +91,6 @@ github.com/mattn/go-sqlite3 v1.14.16 h1:yOQRA0RpS5PFz/oikGwBEqvAWhWg5ufRz4ETLjwp
github.com/mattn/go-sqlite3 v1.14.16/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg=
github.com/mhale/smtpd v0.8.0 h1:5JvdsehCg33PQrZBvFyDMMUDQmvbzVpZgKob7eYBJc0=
github.com/mhale/smtpd v0.8.0/go.mod h1:MQl+y2hwIEQCXtNhe5+55n0GZOjSmeqORDIXbqUL3x4=
github.com/microcosm-cc/bluemonday v1.0.25 h1:4NEwSfiJ+Wva0VxN5B8OwMicaJvD8r9tlJWm9rtloEg=
github.com/microcosm-cc/bluemonday v1.0.25/go.mod h1:ZIOjCQp1OrzBBPIJmfX4qDYFuhU02nx4bn030ixfHLE=
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
Expand All @@ -123,10 +113,6 @@ github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww=
github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0=
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM=
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s=
github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=
github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I=
github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
Expand Down Expand Up @@ -166,7 +152,6 @@ golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY=
golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
Expand Down Expand Up @@ -210,7 +195,6 @@ golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
Expand Down
10 changes: 3 additions & 7 deletions internal/storage/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ import (

"github.com/axllent/mailpit/config"
"github.com/axllent/mailpit/internal/logger"
"github.com/axllent/mailpit/internal/tools/html2text"
"github.com/axllent/mailpit/server/websockets"
"github.com/jhillyerd/enmime"
"github.com/k3a/html2text"
"github.com/leporo/sqlf"
)

Expand All @@ -39,12 +39,8 @@ func createSearchText(env *enmime.Envelope) string {
b.WriteString(env.GetHeader("Bcc") + " ")
b.WriteString(env.GetHeader("Reply-To") + " ")
b.WriteString(env.GetHeader("Return-Path") + " ")
h := strings.TrimSpace(
html2text.HTML2TextWithOptions(
env.HTML,
html2text.WithLinksInnerText(),
),
)

h := html2text.Strip(env.HTML, true)
if h != "" {
b.WriteString(h + " ")
} else {
Expand Down
11 changes: 0 additions & 11 deletions internal/tools/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@ package tools

import (
"fmt"
"strings"

"github.com/microcosm-cc/bluemonday"
"golang.org/x/net/html"
)

Expand All @@ -19,12 +17,3 @@ func GetHTMLAttributeVal(e *html.Node, key string) (string, error) {

return "", fmt.Errorf("%s not found", key)
}

// StripHTML returns text from an HTML string
func stripHTML(h string) string {
p := bluemonday.StrictPolicy()
// // ensure joining html elements are spaced apart, eg table cells etc
h = strings.ReplaceAll(h, "><", "> <")
// return p.Sanitize(h)
return html.UnescapeString(p.Sanitize(h))
}
72 changes: 72 additions & 0 deletions internal/tools/html2text/html2text.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// Package html2text is a simple library to convert HTML to plain text
package html2text

import (
"bytes"
"log"
"regexp"
"strings"

"golang.org/x/net/html"
)

var (
re = regexp.MustCompile(`\s+`)
spaceRe = regexp.MustCompile(`(?mi)<\/(div|p|td|th|h[1-6]|ul|ol|li|address|article|aside|blockquote|dl|dt|footer|header|hr|main|nav|pre|table|thead|tfoot|video)><`)
brRe = regexp.MustCompile(`(?mi)<(br /|br)>`)
imgRe = regexp.MustCompile(`(?mi)<(img)`)
skip = make(map[string]bool)
)

func init() {
skip["script"] = true
skip["title"] = true
skip["head"] = true
skip["link"] = true
skip["meta"] = true
skip["style"] = true
skip["noscript"] = true
}

// Strip will convert a HTML string to plain text
func Strip(h string, includeLinks bool) string {
h = spaceRe.ReplaceAllString(h, "</$1> <")
h = brRe.ReplaceAllString(h, " ")
h = imgRe.ReplaceAllString(h, " <$1")
var buffer bytes.Buffer
doc, err := html.Parse(strings.NewReader(h))
if err != nil {
log.Fatal(err)
}

extract(doc, &buffer, includeLinks)
return clean(buffer.String())
}

func extract(node *html.Node, buff *bytes.Buffer, includeLinks bool) {
if node.Type == html.TextNode {
data := node.Data
if data != "" {
buff.WriteString(data)
}
}
for c := node.FirstChild; c != nil; c = c.NextSibling {
if _, skip := skip[c.Data]; !skip {
if includeLinks && c.Data == "a" {
for _, a := range c.Attr {
if a.Key == "href" && strings.HasPrefix(strings.ToLower(a.Val), "http") {
buff.WriteString(" " + a.Val + " ")
}
}
}
extract(c, buff, includeLinks)
}
}
}

func clean(text string) string {
// replace \uFEFF with space, see https://github.com/golang/go/issues/42274#issuecomment-1017258184
text = strings.ReplaceAll(text, string('\uFEFF'), " ")
text = re.ReplaceAllString(text, " ")
return strings.TrimSpace(text)
}
56 changes: 56 additions & 0 deletions internal/tools/html2text/html2text_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package html2text

import "testing"

func TestPlain(t *testing.T) {
tests := map[string]string{}
tests["this is a test"] = "this is a test"
tests["thiS IS a Test"] = "thiS IS a Test"
tests["thiS IS a Test :-)"] = "thiS IS a Test :-)"
tests["<h1>This is a test.</h1> "] = "This is a test."
tests["<p>Paragraph 1</p><p>Paragraph 2</p>"] = "Paragraph 1 Paragraph 2"
tests["<h1>Heading</h1><p>Paragraph</p>"] = "Heading Paragraph"
tests["<span>Alpha</span>bet <strong>chars</strong>"] = "Alphabet chars"
tests["<span><b>A</b>lpha</span>bet chars."] = "Alphabet chars."
tests["<table><tr><td>First</td><td>Second</td></table>"] = "First Second"
tests[`<h1>Heading</h1>
<p>Paragraph</p>`] = "Heading Paragraph"
tests[`<h1>Heading</h1><p> <a href="https://github.com">linked text</a></p>`] = "Heading linked text"
// broken html
tests[`<h1>Heading</h3><p> <a href="https://github.com">linked text.`] = "Heading linked text."

for str, expected := range tests {
res := Strip(str, false)
if res != expected {
t.Log("error:", res, "!=", expected)
t.Fail()
}
}
}

func TestWithLinks(t *testing.T) {
tests := map[string]string{}
tests["this is a test"] = "this is a test"
tests["thiS IS a Test"] = "thiS IS a Test"
tests["thiS IS a Test :-)"] = "thiS IS a Test :-)"
tests["<h1>This is a test.</h1> "] = "This is a test."
tests["<p>Paragraph 1</p><p>Paragraph 2</p>"] = "Paragraph 1 Paragraph 2"
tests["<h1>Heading</h1><p>Paragraph</p>"] = "Heading Paragraph"
tests["<span>Alpha</span>bet <strong>chars</strong>"] = "Alphabet chars"
tests["<span><b>A</b>lpha</span>bet chars."] = "Alphabet chars."
tests["<table><tr><td>First</td><td>Second</td></table>"] = "First Second"
tests["<h1>Heading</h1><p>Paragraph</p>"] = "Heading Paragraph"
tests[`<h1>Heading</h1>
<p>Paragraph</p>`] = "Heading Paragraph"
tests[`<h1>Heading</h1><p> <a href="https://github.com">linked text</a></p>`] = "Heading https://github.com linked text"
// broken html
tests[`<h1>Heading</h3><p> <a href="https://github.com">linked text.`] = "Heading https://github.com linked text."

for str, expected := range tests {
res := Strip(str, true)
if res != expected {
t.Log("error:", res, "!=", expected)
t.Fail()
}
}
}
8 changes: 3 additions & 5 deletions internal/tools/snippets.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package tools
import (
"regexp"
"strings"

"github.com/axllent/mailpit/internal/tools/html2text"
)

// CreateSnippet returns a message snippet. It will use the HTML version (if it exists)
Expand All @@ -12,17 +14,13 @@ func CreateSnippet(text, html string) string {
html = strings.TrimSpace(html)
limit := 200
spaceRe := regexp.MustCompile(`\s+`)
nlRe := regexp.MustCompile(`\r?\n`)

if text == "" && html == "" {
return ""
}

if html != "" {
data := nlRe.ReplaceAllString(stripHTML(html), " ")
// replace \uFEFF with space, see https://github.com/golang/go/issues/42274#issuecomment-1017258184
data = strings.ReplaceAll(data, string('\uFEFF'), " ")
data = strings.TrimSpace(spaceRe.ReplaceAllString(data, " "))
data := html2text.Strip(html, false)

if len(data) <= limit {
return data
Expand Down

0 comments on commit eeac32d

Please sign in to comment.