Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize selectors matching #510

Merged
merged 8 commits into from
Dec 28, 2023
37 changes: 34 additions & 3 deletions lib/floki/selector.ex
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,27 @@ defmodule Floki.Selector do
def match?(%Comment{}, _selector, _tree), do: false

def match?(html_node, selector, tree) do
id_match?(html_node, selector.id) && namespace_match?(html_node, selector.namespace) &&
type_match?(html_node, selector.type) && classes_matches?(html_node, selector.classes) &&
can_match_combinator?(html_node, selector.combinator) &&
id_match?(html_node, selector.id) &&
namespace_match?(html_node, selector.namespace) &&
type_match?(html_node, selector.type) &&
classes_matches?(html_node, selector.classes) &&
attributes_matches?(html_node, selector.attributes) &&
pseudo_classes_match?(html_node, selector.pseudo_classes, tree)
end

defp can_match_combinator?(_node, nil), do: true

defp can_match_combinator?(
%HTMLNode{children_nodes_ids: []},
%Selector.Combinator{match_type: match_type}
)
when match_type in [:child, :descendant] do
false
end

defp can_match_combinator?(_node, _combinator), do: true

defp id_match?(_node, nil), do: true
defp id_match?(%HTMLNode{attributes: []}, _), do: false
defp id_match?(%HTMLNode{type: :pi}, _), do: false
Expand Down Expand Up @@ -143,8 +158,24 @@ defmodule Floki.Selector do

defp do_classes_matches?(nil, _classes), do: false

defp do_classes_matches?(class_attr_value, [class | _])
when bit_size(class_attr_value) < bit_size(class) do
false
end

defp do_classes_matches?(class_attr_value, [class])
when bit_size(class_attr_value) == bit_size(class) do
class == class_attr_value
end

defp do_classes_matches?(class_attr_value, [class]) do
Enum.member?(String.split(class_attr_value, [" ", "\t", "\n"]), class)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is not exactly the same from the regex version, because the occurrence of multiple whitespace chars in sequence are treated differently.

For example, if we have a class attribute that contains multiple spaces between classes values, the former version would capture that correctly, while the new version does not:

class_attr_value = "my class\tanother   one"

# new
String.split(class_attr_value, [" ", "\t", "\n"])
#=> ["my", "class", "another", "", "", "one"]

# old
String.split(class_attr_value, ~r/\s+/)
#=> ["my", "class", "another", "one"]

We can, however, reject the empty binaries from the match. Can you try this and see if there is an impact in performance?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, fa64b01.
Adding trim: true didn't had performance impact in the benchmark, but it did reduce the memory usage, since we won't have this empty strings in the list.

end

defp do_classes_matches?(class_attr_value, classes) do
classes -- String.split(class_attr_value, ~r/\s+/) == []
min_size = Enum.reduce(classes, -1, fn item, acc -> acc + 1 + bit_size(item) end)
can_match? = bit_size(class_attr_value) >= min_size
can_match? && classes -- String.split(class_attr_value, [" ", "\t", "\n"]) == []
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same thing here :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

end

defp attributes_matches?(_node, []), do: true
Expand Down
4 changes: 2 additions & 2 deletions lib/floki/selector/attribute_selector.ex
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ defmodule Floki.Selector.AttributeSelector do
s.attribute
|> get_value(attributes)
# Splits by whitespaces ("a b c" -> ["a", "b", "c"])
|> String.split(~r/\s+/)
|> String.split([" ", "\t", "\n"])
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here as well. I think we need to consider the same problems here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

|> Enum.any?(fn v -> String.downcase(v) == selector_value end)
end

Expand Down Expand Up @@ -103,7 +103,7 @@ defmodule Floki.Selector.AttributeSelector do

def match?(attributes, s = %AttributeSelector{match_type: :includes, value: value}) do
get_value(s.attribute, attributes)
|> String.split(~r/\s+/)
|> String.split([" ", "\t", "\n"])
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And here :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

|> Enum.any?(fn v -> v == value end)
end

Expand Down
10 changes: 7 additions & 3 deletions lib/floki/selector/parser.ex
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ defmodule Floki.Selector.Parser do
do_parse_all(remaining_tokens, [selector | selectors])
end

defp do_parse([], selector), do: {selector, []}
defp do_parse([{:close_parentesis, _} | t], selector), do: {selector, t}
defp do_parse([{:comma, _} | t], selector), do: {selector, t}
defp do_parse([], selector), do: {optimize_selector(selector), []}
defp do_parse([{:close_parentesis, _} | t], selector), do: {optimize_selector(selector), t}
defp do_parse([{:comma, _} | t], selector), do: {optimize_selector(selector), t}

defp do_parse([{:identifier, _, namespace}, {:namespace_pipe, _} | t], selector) do
do_parse(t, %{selector | namespace: to_string(namespace)})
Expand Down Expand Up @@ -267,4 +267,8 @@ defmodule Floki.Selector.Parser do
Logger.debug("Only simple selectors are allowed in :not() pseudo-class. Ignoring.")
nil
end

defp optimize_selector(selector) do
philss marked this conversation as resolved.
Show resolved Hide resolved
%{selector | classes: Enum.sort(selector.classes, &(byte_size(&1) >= byte_size(&2)))}
end
end
8 changes: 8 additions & 0 deletions test/floki/selector/parser_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ defmodule Floki.Selector.ParserTest do
]
end

test "reorders classes in selector to improve matching performance" do
tokens = tokenize(".small.longer.even-longer")

assert Parser.parse(tokens) == [
%Selector{classes: ["even-longer", "longer", "small"]}
]
end

test "multiple selectors" do
tokens = tokenize("ol, ul")

Expand Down
Loading