Skip to content

Commit

Permalink
Fix data URLs with spaces (#349)
Browse files Browse the repository at this point in the history
  • Loading branch information
raviqqe authored Nov 25, 2023
1 parent 0b82260 commit b2f1326
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 4 deletions.
25 changes: 21 additions & 4 deletions link_finder.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"net/url"
"regexp"
"strings"
"unicode"

"github.com/yhat/scrape"
"golang.org/x/net/html"
Expand All @@ -22,7 +23,7 @@ var atomToAttributes = map[atom.Atom][]string{
atom.Meta: {"content"},
}

var imageDescriptorPattern = regexp.MustCompile(" [^ ]*$")
var imageDescriptorPattern = regexp.MustCompile(`(\S)\s+\S+\s*$`)

type linkFinder struct {
linkFilterer linkFilterer
Expand All @@ -43,7 +44,7 @@ func (f linkFinder) Find(n *html.Node, base *url.URL) map[string]error {
ss := f.parseLinks(n, a)

for _, s := range ss {
s := strings.TrimSpace(s)
s := f.trimUrl(s)

if s == "" {
continue
Expand All @@ -67,14 +68,14 @@ func (f linkFinder) Find(n *html.Node, base *url.URL) map[string]error {
return ls
}

func (linkFinder) parseLinks(n *html.Node, a string) []string {
func (f linkFinder) parseLinks(n *html.Node, a string) []string {
s := scrape.Attr(n, a)
ss := []string{}

switch a {
case "srcset":
for _, s := range strings.Split(s, ",") {
ss = append(ss, imageDescriptorPattern.ReplaceAllString(strings.TrimSpace(s), ""))
ss = append(ss, f.trimUrl(imageDescriptorPattern.ReplaceAllString(s, "$1")))
}
case "content":
switch scrape.Attr(n, "property") {
Expand All @@ -87,3 +88,19 @@ func (linkFinder) parseLinks(n *html.Node, a string) []string {

return ss
}

// trimUrl normalizes a raw link string taken from an HTML attribute.
//
// Leading and trailing white space is always removed. When the result is a
// data URL, every remaining white space character inside it is removed as
// well; any other string keeps its inner white space untouched.
//
// The "data:" scheme is matched case-insensitively because URL schemes are
// case-insensitive (RFC 3986, section 3.1), so "DATA:..." is handled the same
// way as "data:...".
func (linkFinder) trimUrl(s string) string {
	const scheme = "data:"

	s = strings.TrimSpace(s)

	if len(s) < len(scheme) || !strings.EqualFold(s[:len(scheme)], scheme) {
		return s
	}

	// Returning a negative rune from the mapping function drops the character.
	return strings.Map(func(r rune) rune {
		if unicode.IsSpace(r) {
			return -1
		}

		return r
	}, s)
}
14 changes: 14 additions & 0 deletions link_finder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,20 @@ func TestLinkFinderFindMetaTags(t *testing.T) {
assert.Nil(t, err)
}

// TestLinkFinderFindDataSchemeLinkWithSpaces checks that Find reports no
// links for an anchor whose href is a data URL containing spaces.
func TestLinkFinderFindDataSchemeLinkWithSpaces(t *testing.T) {
	// The href has a space after the comma and a trailing space.
	const anchor = `<a href="data:text/plain, Hello,%20world! " />`

	base, err := url.Parse("http://foo.com")
	assert.Nil(t, err)

	root, err := html.Parse(strings.NewReader(htmlWithBody(anchor)))
	assert.Nil(t, err)

	links := newTestLinkFinder().Find(root, base)

	assert.Len(t, links, 0)
}

func TestLinkFinderIgnoreMetaTags(t *testing.T) {
b, err := url.Parse("http://foo.com")
assert.Nil(t, err)
Expand Down

0 comments on commit b2f1326

Please sign in to comment.