From 341ed1df2064bfd34d2b2a1e1f4c8e3936e08e69 Mon Sep 17 00:00:00 2001 From: Yuri Pereira Constante Date: Thu, 21 Dec 2023 00:29:16 -0300 Subject: [PATCH 1/8] Use list instead of regex for string split --- lib/floki/selector.ex | 2 +- lib/floki/selector/attribute_selector.ex | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/floki/selector.ex b/lib/floki/selector.ex index 706e1659..8d5f38af 100644 --- a/lib/floki/selector.ex +++ b/lib/floki/selector.ex @@ -144,7 +144,7 @@ defmodule Floki.Selector do defp do_classes_matches?(nil, _classes), do: false defp do_classes_matches?(class_attr_value, classes) do - classes -- String.split(class_attr_value, ~r/\s+/) == [] + classes -- String.split(class_attr_value, [" ", "\t", "\n"]) == [] end defp attributes_matches?(_node, []), do: true diff --git a/lib/floki/selector/attribute_selector.ex b/lib/floki/selector/attribute_selector.ex index 5e8da68c..70e8e049 100644 --- a/lib/floki/selector/attribute_selector.ex +++ b/lib/floki/selector/attribute_selector.ex @@ -63,7 +63,7 @@ defmodule Floki.Selector.AttributeSelector do s.attribute |> get_value(attributes) # Splits by whitespaces ("a b c" -> ["a", "b", "c"]) - |> String.split(~r/\s+/) + |> String.split([" ", "\t", "\n"]) |> Enum.any?(fn v -> String.downcase(v) == selector_value end) end @@ -103,7 +103,7 @@ defmodule Floki.Selector.AttributeSelector do def match?(attributes, s = %AttributeSelector{match_type: :includes, value: value}) do get_value(s.attribute, attributes) - |> String.split(~r/\s+/) + |> String.split([" ", "\t", "\n"]) |> Enum.any?(fn v -> v == value end) end From e5858ffae4dabe69b465af0d84b625b923349524 Mon Sep 17 00:00:00 2001 From: Yuri Pereira Constante Date: Fri, 22 Dec 2023 08:15:05 -0300 Subject: [PATCH 2/8] Skip class_attr_value split if size smaller than first class --- lib/floki/selector.ex | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/floki/selector.ex b/lib/floki/selector.ex index 8d5f38af..04b1126f 100644 --- a/lib/floki/selector.ex +++ b/lib/floki/selector.ex @@ -143,6 +143,11 @@ defmodule Floki.Selector do defp do_classes_matches?(nil, _classes), do: false + defp do_classes_matches?(class_attr_value, [class | _]) + when bit_size(class_attr_value) < bit_size(class) do + false + end + defp do_classes_matches?(class_attr_value, classes) do classes -- String.split(class_attr_value, [" ", "\t", "\n"]) == [] end From d03143d2e22344b257dbae178c23a3e28a7a6594 Mon Sep 17 00:00:00 2001 From: Yuri Pereira Constante Date: Fri, 22 Dec 2023 08:17:24 -0300 Subject: [PATCH 3/8] Skip class_attr_value split when same size as single class --- lib/floki/selector.ex | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/floki/selector.ex b/lib/floki/selector.ex index 04b1126f..c8770ccb 100644 --- a/lib/floki/selector.ex +++ b/lib/floki/selector.ex @@ -148,6 +148,11 @@ defmodule Floki.Selector do false end + defp do_classes_matches?(class_attr_value, [class]) + when bit_size(class_attr_value) == bit_size(class) do + class == class_attr_value + end + defp do_classes_matches?(class_attr_value, classes) do classes -- String.split(class_attr_value, [" ", "\t", "\n"]) == [] end From 9ae289bb5e202010706da2c75ccf8d832260c4af Mon Sep 17 00:00:00 2001 From: Yuri Pereira Constante Date: Fri, 22 Dec 2023 08:20:24 -0300 Subject: [PATCH 4/8] Compute minimal size to try to skip class_attr_value split --- lib/floki/selector.ex | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/floki/selector.ex b/lib/floki/selector.ex index c8770ccb..5dca9c3c 100644 --- a/lib/floki/selector.ex +++ b/lib/floki/selector.ex @@ -153,8 +153,14 @@ defmodule Floki.Selector do class == class_attr_value end + defp do_classes_matches?(class_attr_value, [class]) do + Enum.member?(String.split(class_attr_value, [" ", "\t", "\n"]), class) + end + defp do_classes_matches?(class_attr_value, classes) do - classes -- String.split(class_attr_value, [" ", "\t", "\n"]) == [] + min_size = Enum.reduce(classes, -1, fn item, acc -> acc + 1 + bit_size(item) end) + can_match? = bit_size(class_attr_value) >= min_size + can_match? && classes -- String.split(class_attr_value, [" ", "\t", "\n"]) == [] end defp attributes_matches?(_node, []), do: true From 99e6a3233a6ecbfcca3e90b71c17810a8ac091e8 Mon Sep 17 00:00:00 2001 From: Yuri Pereira Constante Date: Fri, 22 Dec 2023 12:43:49 -0300 Subject: [PATCH 5/8] Reorder selector classes to improve performance --- lib/floki/selector/parser.ex | 10 +++++++--- test/floki/selector/parser_test.exs | 8 ++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/lib/floki/selector/parser.ex b/lib/floki/selector/parser.ex index 8ef96d2b..404e3e26 100644 --- a/lib/floki/selector/parser.ex +++ b/lib/floki/selector/parser.ex @@ -37,9 +37,9 @@ defmodule Floki.Selector.Parser do do_parse_all(remaining_tokens, [selector | selectors]) end - defp do_parse([], selector), do: {selector, []} - defp do_parse([{:close_parentesis, _} | t], selector), do: {selector, t} - defp do_parse([{:comma, _} | t], selector), do: {selector, t} + defp do_parse([], selector), do: {optimize_selector(selector), []} + defp do_parse([{:close_parentesis, _} | t], selector), do: {optimize_selector(selector), t} + defp do_parse([{:comma, _} | t], selector), do: {optimize_selector(selector), t} defp do_parse([{:identifier, _, namespace}, {:namespace_pipe, _} | t], selector) do do_parse(t, %{selector | namespace: to_string(namespace)}) @@ -267,4 +267,8 @@ defmodule Floki.Selector.Parser do Logger.debug("Only simple selectors are allowed in :not() pseudo-class. Ignoring.") nil end + + defp optimize_selector(selector) do + %{selector | classes: Enum.sort(selector.classes, &(byte_size(&1) >= byte_size(&2)))} + end end diff --git a/test/floki/selector/parser_test.exs b/test/floki/selector/parser_test.exs index 23cbca8f..68847c55 100644 --- a/test/floki/selector/parser_test.exs +++ b/test/floki/selector/parser_test.exs @@ -31,6 +31,14 @@ defmodule Floki.Selector.ParserTest do ] end + test "reorders classes in selector to improve matching performance" do + tokens = tokenize(".small.longer.even-longer") + + assert Parser.parse(tokens) == [ + %Selector{classes: ["even-longer", "longer", "small"]} + ] + end + test "multiple selectors" do tokens = tokenize("ol, ul") From 75f9507283875b4d579ba4d74f24f19294fc49b0 Mon Sep 17 00:00:00 2001 From: Yuri Pereira Constante Date: Fri, 22 Dec 2023 12:01:01 -0300 Subject: [PATCH 6/8] Skip when selector requires children and node doesn't have one --- lib/floki/selector.ex | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/lib/floki/selector.ex b/lib/floki/selector.ex index 5dca9c3c..093a154e 100644 --- a/lib/floki/selector.ex +++ b/lib/floki/selector.ex @@ -78,12 +78,27 @@ defmodule Floki.Selector do def match?(%Comment{}, _selector, _tree), do: false def match?(html_node, selector, tree) do - id_match?(html_node, selector.id) && namespace_match?(html_node, selector.namespace) && - type_match?(html_node, selector.type) && classes_matches?(html_node, selector.classes) && + can_match_combinator?(html_node, selector.combinator) && + id_match?(html_node, selector.id) && + namespace_match?(html_node, selector.namespace) && + type_match?(html_node, selector.type) && + classes_matches?(html_node, selector.classes) && attributes_matches?(html_node, selector.attributes) && pseudo_classes_match?(html_node, selector.pseudo_classes, tree) end + defp can_match_combinator?(_node, nil), do: true + + defp can_match_combinator?( + %HTMLNode{children_nodes_ids: []}, + %Selector.Combinator{match_type: match_type} + ) + when match_type in [:child, :descendant] do + false + end + + defp can_match_combinator?(_node, _combinator), do: true + defp id_match?(_node, nil), do: true defp id_match?(%HTMLNode{attributes: []}, _), do: false defp id_match?(%HTMLNode{type: :pi}, _), do: false From fa64b01261028d47eb5d41c1f6aab840a781f636 Mon Sep 17 00:00:00 2001 From: Yuri Pereira Constante Date: Wed, 27 Dec 2023 20:56:04 -0300 Subject: [PATCH 7/8] Remove empty values on split --- lib/floki/selector.ex | 6 ++++-- lib/floki/selector/attribute_selector.ex | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/floki/selector.ex b/lib/floki/selector.ex index 093a154e..b40a96ca 100644 --- a/lib/floki/selector.ex +++ b/lib/floki/selector.ex @@ -169,13 +169,15 @@ defmodule Floki.Selector do end defp do_classes_matches?(class_attr_value, [class]) do - Enum.member?(String.split(class_attr_value, [" ", "\t", "\n"]), class) + class_attr_value + |> String.split([" ", "\t", "\n"], trim: true) + |> Enum.member?(class) end defp do_classes_matches?(class_attr_value, classes) do min_size = Enum.reduce(classes, -1, fn item, acc -> acc + 1 + bit_size(item) end) can_match? = bit_size(class_attr_value) >= min_size - can_match? && classes -- String.split(class_attr_value, [" ", "\t", "\n"]) == [] + can_match? && classes -- String.split(class_attr_value, [" ", "\t", "\n"], trim: true) == [] end defp attributes_matches?(_node, []), do: true diff --git a/lib/floki/selector/attribute_selector.ex b/lib/floki/selector/attribute_selector.ex index 70e8e049..3ef3c4d2 100644 --- a/lib/floki/selector/attribute_selector.ex +++ b/lib/floki/selector/attribute_selector.ex @@ -63,7 +63,7 @@ defmodule Floki.Selector.AttributeSelector do s.attribute |> get_value(attributes) # Splits by whitespaces ("a b c" -> ["a", "b", "c"]) - |> String.split([" ", "\t", "\n"]) + |> String.split([" ", "\t", "\n"], trim: true) |> Enum.any?(fn v -> String.downcase(v) == selector_value end) end @@ -103,8 +103,8 @@ defmodule Floki.Selector.AttributeSelector do def match?(attributes, s = %AttributeSelector{match_type: :includes, value: value}) do get_value(s.attribute, attributes) - |> String.split([" ", "\t", "\n"]) - |> Enum.any?(fn v -> v == value end) + |> String.split([" ", "\t", "\n"], trim: true) + |> Enum.member?(value) end def match?(attributes, s = %AttributeSelector{match_type: :dash_match}) do From e4a8a8a9c2c6106fe7c5c23543c39f2969a5b57f Mon Sep 17 00:00:00 2001 From: Yuri Pereira Constante Date: Wed, 27 Dec 2023 21:33:42 -0300 Subject: [PATCH 8/8] Add comment to optimize_selector --- lib/floki/selector/parser.ex | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/floki/selector/parser.ex b/lib/floki/selector/parser.ex index 404e3e26..dd32ca5d 100644 --- a/lib/floki/selector/parser.ex +++ b/lib/floki/selector/parser.ex @@ -268,7 +268,8 @@ defmodule Floki.Selector.Parser do nil end + # Reorders classes in selector to improve matching performance. defp optimize_selector(selector) do - %{selector | classes: Enum.sort(selector.classes, &(byte_size(&1) >= byte_size(&2)))} + %{selector | classes: Enum.sort(selector.classes, &(bit_size(&1) >= bit_size(&2)))} end end