diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 89e38cd..41f541a --- a/README.md +++ b/README.md @@ -11,6 +11,8 @@ As of v1.0.0 the API is stable and used in multiple (personal) projects. Unless ## Change Log +**2019-08-20** v1.2.0 words methods + **2019-02-14** v1.1.0 classes methods **2019-02-11** v1.0.0 initial release diff --git a/htmlutil.go b/htmlutil.go index 6bc832a..ffe5ff6 100755 --- a/htmlutil.go +++ b/htmlutil.go @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - */ +*/ // Package htmlutil implements a wrapper for Golang's html5 tokeniser / parser implementation, making it much easier to // find and extract information, aiming to be powerful and intuitive while remaining a minimal and logical extension. @@ -153,7 +153,14 @@ func (n Node) OuterHTML() string { // OuterText builds a string from the data of all text nodes in the sub-tree, starting from and including `n` func (n Node) OuterText() string { - return encodeText(n.Data) + return string(encodeText(n.Data)) +} + +// OuterWords builds a space-separated string from the whitespace-separated data of all text nodes in the sub-tree, +// starting from and including `n`, note that text separated / split across multiple elements will be considered as +// multiple words (words within non-empty sibling elements will be split by a single space) +func (n Node) OuterWords() string { + return string(encodeWords(n.Data)) } // InnerHTML builds a string using the outer html of all children matching all filters (see the `FindNode` method) @@ -182,6 +189,25 @@ func (n Node) InnerText(filters ...func(node Node) bool) string { return string(b) } +// InnerWords builds a string using the outer words of all children matching all filters (see the `FindNode` method and +// the `OuterWords` methods) +func (n Node) InnerWords(filters ...func(node Node) bool) string { + var b []byte + n.Range( + func(i int, node Node) bool { + if s := node.OuterWords(); s != `` { + if len(b) != 0 { + b = append(b, ' ') + } + b = append(b, []byte(s)...) + } + return true + }, + filters..., + ) + return string(b) +} + // SiblingIndex returns the total number of previous siblings matching any filters (see the `FindNode` method) func (n Node) SiblingIndex(filters ...func(node Node) bool) int { return siblingIndex(n, filters...) diff --git a/htmlutil_test.go b/htmlutil_test.go index bdd8b53..a777262 100755 --- a/htmlutil_test.go +++ b/htmlutil_test.go @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - */ +*/ package htmlutil @@ -339,11 +339,27 @@ func TestEncodeHTML_panic(t *testing.T) { } func TestEncodeText_nil(t *testing.T) { - if v := encodeText(nil); v != "" { + if v := encodeText(nil); v != nil { t.Fatal(v) } } +func TestEncodeWords_nil(t *testing.T) { + if v := encodeWords(nil); v != nil { + t.Fatal(v) + } +} + +func TestEncodeWords_siblings(t *testing.T) { + node, err := Parse(strings.NewReader(`
one
two
three
four
`)) + if err != nil { + t.Fatal(err) + } + if v := string(encodeWords(node.Data)); v != `one two three four` { + t.Error(v) + } +} + func TestParse_eof(t *testing.T) { reader, _ := io.Pipe() _ = reader.Close() @@ -425,6 +441,20 @@ FOUR ! ` { t.Fatal(v) } + if v := node.InnerWords(); v != `ONE TWO THREE FOUR !` { + t.Fatal(v) + } + if v := node.InnerWords(func(node Node) bool { + return node.Offset() == 0 && + node.Type() == html.TextNode + }); v != `FOUR !` { + t.Fatal(v) + } + if v := node.InnerWords(func(node Node) bool { + return node.Offset() == 100 + }); v != `` { + t.Fatal(v) + } } func TestNode_GetAttr_caseInsensitive(t *testing.T) { diff --git a/internal.go b/internal.go index 7f30835..57aac8f 100755 --- a/internal.go +++ b/internal.go @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - */ +*/ package htmlutil @@ -154,7 +154,18 @@ func getNode(node Node, filters ...func(node Node) bool) Node { return result } -func encodeTextBytes(node *html.Node) []byte { +func encodeHTML(node *html.Node) string { + if node == nil { + return "" + } + buffer := new(bytes.Buffer) + if err := html.Render(buffer, node); err != nil { + panic(err) + } + return buffer.String() +} + +func encodeText(node *html.Node) []byte { if node == nil { return nil } @@ -163,24 +174,33 @@ func encodeTextBytes(node *html.Node) []byte { } var b []byte for node := node.FirstChild; node != nil; node = node.NextSibling { - b = append(b, encodeTextBytes(node)...) + b = append(b, encodeText(node)...) } return b } -func encodeText(node *html.Node) string { - return string(encodeTextBytes(node)) -} - -func encodeHTML(node *html.Node) string { +func encodeWords(node *html.Node) (b []byte) { if node == nil { - return "" + return } - buffer := new(bytes.Buffer) - if err := html.Render(buffer, node); err != nil { - panic(err) + if node.Type == html.TextNode { + for _, word := range strings.Fields(node.Data) { + if len(b) != 0 { + b = append(b, ' ') + } + b = append(b, []byte(word)...) + } + return } - return buffer.String() + for node := node.FirstChild; node != nil; node = node.NextSibling { + if words := encodeWords(node); len(words) != 0 { + if len(b) != 0 { + b = append(b, ' ') + } + b = append(b, words...) + } + } + return } func getAttr(namespace string, key string, attributes ...html.Attribute) (html.Attribute, bool) { diff --git a/internal_test.go b/internal_test.go index 49696f1..9a1bea7 100755 --- a/internal_test.go +++ b/internal_test.go @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - */ +*/ package htmlutil