diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 89e38cd..41f541a --- a/README.md +++ b/README.md @@ -11,6 +11,8 @@ As of v1.0.0 the API is stable and used in multiple (personal) projects. Unless ## Change Log +**2019-08-20** v1.2.0 words methods + **2019-02-14** v1.1.0 classes methods **2019-02-11** v1.0.0 initial release diff --git a/htmlutil.go b/htmlutil.go index 6bc832a..ffe5ff6 100755 --- a/htmlutil.go +++ b/htmlutil.go @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - */ +*/ // Package htmlutil implements a wrapper for Golang's html5 tokeniser / parser implementation, making it much easier to // find and extract information, aiming to be powerful and intuitive while remaining a minimal and logical extension. @@ -153,7 +153,14 @@ func (n Node) OuterHTML() string { // OuterText builds a string from the data of all text nodes in the sub-tree, starting from and including `n` func (n Node) OuterText() string { - return encodeText(n.Data) + return string(encodeText(n.Data)) +} + +// OuterWords builds a space-separated string from the whitespace-separated data of all text nodes in the sub-tree, +// starting from and including `n`, note that text separated / split across multiple elements will be considered as +// multiple words (words within non-empty sibling elements will be split by a single space) +func (n Node) OuterWords() string { + return string(encodeWords(n.Data)) } // InnerHTML builds a string using the outer html of all children matching all filters (see the `FindNode` method) @@ -182,6 +189,25 @@ func (n Node) InnerText(filters ...func(node Node) bool) string { return string(b) } +// InnerWords builds a string using the outer words of all children matching all filters (see the `FindNode` method and +// the `OuterWords` methods) +func (n Node) InnerWords(filters ...func(node Node) bool) string { + var b []byte + n.Range( + func(i int, node Node) bool { + if s := node.OuterWords(); s != `` { + if len(b) != 0 { + b = append(b, ' ') + } + b = append(b, []byte(s)...) + } + return true + }, + filters..., + ) + return string(b) +} + // SiblingIndex returns the total number of previous siblings matching any filters (see the `FindNode` method) func (n Node) SiblingIndex(filters ...func(node Node) bool) int { return siblingIndex(n, filters...) diff --git a/htmlutil_test.go b/htmlutil_test.go index bdd8b53..a777262 100755 --- a/htmlutil_test.go +++ b/htmlutil_test.go @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - */ +*/ package htmlutil @@ -339,11 +339,27 @@ func TestEncodeHTML_panic(t *testing.T) { } func TestEncodeText_nil(t *testing.T) { - if v := encodeText(nil); v != "" { + if v := encodeText(nil); v != nil { t.Fatal(v) } } +func TestEncodeWords_nil(t *testing.T) { + if v := encodeWords(nil); v != nil { + t.Fatal(v) + } +} + +func TestEncodeWords_siblings(t *testing.T) { + node, err := Parse(strings.NewReader(`