Skip to content

Commit

Permalink
HTML API: Add class name utilities has_class() and class_list().
Browse files Browse the repository at this point in the history
This patch adds two new public methods to the HTML Tag Processor:
 - `has_class()` indicates if a matched tag contains a given CSS class name.
 - `class_list()` returns a generator to iterate over all the class names in a matched tag.

Included in this patch is a refactoring of the internal logic when matching
a tag to reuse the new `has_class()` function. Previously it was relying on
optimized code in the `matches()` function which performed byte-for-byte
class name comparison. With the change in this patch it will perform class
name matching on the decoded value, which might differ if a class attribute
contains character references.

These methods may be useful for running more complicated queries based
on the presence or absence of CSS class names. The use of these methods
avoids the need to manually decode the class attribute as reported by
`$process->get_attribute( 'class' )`.

Props dmsnell.
Fixes #59209.

git-svn-id: https://develop.svn.wordpress.org/trunk@56703 602fd350-edb4-49c9-b593-d223f7449a82
  • Loading branch information
ockham committed Sep 26, 2023
1 parent 086010a commit cecc810
Show file tree
Hide file tree
Showing 2 changed files with 244 additions and 58 deletions.
147 changes: 89 additions & 58 deletions src/wp-includes/html-api/class-wp-html-tag-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,94 @@ public function next_tag( $query = null ) {
}


/**
 * Generator for a foreach loop to step through each class name for the matched tag.
 *
 * This generator function is designed to be used inside a "foreach" loop.
 *
 * The `class` attribute is read through `get_attribute()`, so character
 * references are already decoded. The value is split on HTML whitespace
 * (space, tab, form feed, carriage return, newline), each name is
 * lower-cased, and every distinct name is yielded once, in order of
 * first appearance. Nothing is yielded when `get_attribute( 'class' )`
 * does not return a string.
 *
 * Example:
 *
 *     $p = new WP_HTML_Tag_Processor( "<div class='free &lt;egg&gt;\tlang-en'>" );
 *     $p->next_tag();
 *     foreach ( $p->class_list() as $class_name ) {
 *         echo "{$class_name} ";
 *     }
 *     // Outputs: "free <egg> lang-en "
 *
 * @since 6.4.0
 *
 * @return Generator Yields each unique, lower-cased class name as a string.
 */
public function class_list() {
	/** @var string $class contains the string value of the class attribute, with character references decoded. */
	$class = $this->get_attribute( 'class' );

	if ( ! is_string( $class ) ) {
		return;
	}

	$seen = array();

	// The attribute value never changes while iterating, so measure it once.
	$end = strlen( $class );
	$at  = 0;

	while ( $at < $end ) {
		// Skip past any initial boundary characters.
		$at += strspn( $class, " \t\f\r\n", $at );
		if ( $at >= $end ) {
			return;
		}

		/*
		 * Find the byte length until the next boundary.
		 *
		 * This is always at least one: the byte at `$at` is inside the
		 * string and is not a boundary character, because the boundary
		 * characters were skipped just above.
		 */
		$length = strcspn( $class, " \t\f\r\n", $at );

		/*
		 * CSS class names are case-insensitive in the ASCII range.
		 *
		 * @see https://www.w3.org/TR/CSS2/syndata.html#x1
		 */
		$name = strtolower( substr( $class, $at, $length ) );
		$at  += $length;

		/*
		 * It's expected that the number of class names for a given tag is relatively small.
		 * Given this, it is probably faster overall to scan an array for a value rather
		 * than to use the class name as a key and check if it's a key of $seen.
		 */
		if ( in_array( $name, $seen, true ) ) {
			continue;
		}

		$seen[] = $name;
		yield $name;
	}
}


/**
 * Reports whether the matched tag carries the given class name.
 *
 * The wanted name is lower-cased and compared against the names produced
 * by `class_list()`, making the comparison ASCII case-insensitive.
 *
 * @since 6.4.0
 *
 * @param string $wanted_class Look for this CSS class name, ASCII case-insensitive.
 * @return bool|null Whether the matched tag contains the given class name, or null if not matched.
 */
public function has_class( $wanted_class ) {
	$is_tag_matched = (bool) $this->tag_name_starts_at;
	if ( false === $is_tag_matched ) {
		return null;
	}

	$needle   = strtolower( $wanted_class );
	$is_found = false;

	foreach ( $this->class_list() as $candidate ) {
		if ( $needle === $candidate ) {
			// No need to keep scanning once the name has been seen.
			$is_found = true;
			break;
		}
	}

	return $is_found;
}


/**
* Sets a bookmark in the HTML document.
*
Expand Down Expand Up @@ -2347,64 +2435,7 @@ private function matches() {
}
}

$needs_class_name = null !== $this->sought_class_name;

if ( $needs_class_name && ! isset( $this->attributes['class'] ) ) {
return false;
}

/*
* Match byte-for-byte (case-sensitive and encoding-form-sensitive) on the class name.
*
* This will overlook certain classes that exist in other lexical variations
* than was supplied to the search query, but requires more complicated searching.
*/
if ( $needs_class_name ) {
$class_start = $this->attributes['class']->value_starts_at;
$class_end = $class_start + $this->attributes['class']->value_length;
$class_at = $class_start;

/*
* Ensure that boundaries surround the class name to avoid matching on
* substrings of a longer name. For example, the sequence "not-odd"
* should not match for the class "odd" even though "odd" is found
* within the class attribute text.
*
* See https://html.spec.whatwg.org/#attributes-3
* See https://html.spec.whatwg.org/#space-separated-tokens
*/
while (
// phpcs:ignore WordPress.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
false !== ( $class_at = strpos( $this->html, $this->sought_class_name, $class_at ) ) &&
$class_at < $class_end
) {
/*
* Verify this class starts at a boundary.
*/
if ( $class_at > $class_start ) {
$character = $this->html[ $class_at - 1 ];

if ( ' ' !== $character && "\t" !== $character && "\f" !== $character && "\r" !== $character && "\n" !== $character ) {
$class_at += strlen( $this->sought_class_name );
continue;
}
}

/*
* Verify this class ends at a boundary as well.
*/
if ( $class_at + strlen( $this->sought_class_name ) < $class_end ) {
$character = $this->html[ $class_at + strlen( $this->sought_class_name ) ];

if ( ' ' !== $character && "\t" !== $character && "\f" !== $character && "\r" !== $character && "\n" !== $character ) {
$class_at += strlen( $this->sought_class_name );
continue;
}
}

return true;
}

if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) {
return false;
}

Expand Down
155 changes: 155 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlTagProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,17 @@ public function test_next_tag_should_return_false_for_a_non_existing_tag() {
$this->assertFalse( $p->next_tag( 'p' ), 'Querying a non-existing tag did not return false' );
}

/**
 * Ensures that a `class_name` query is compared against the decoded class
 * attribute, so a class written with character references is still found.
 *
 * @ticket 59209
 *
 * @covers WP_HTML_Tag_Processor::next_tag
 */
public function test_next_tag_matches_decoded_class_names() {
	$processor = new WP_HTML_Tag_Processor( '<div class="&lt;egg&gt;">' );
	$query     = array( 'class_name' => '<egg>' );

	$was_found = $processor->next_tag( $query );

	$this->assertTrue( $was_found, 'Failed to find tag with HTML-encoded class name.' );
}

/**
* @ticket 56299
* @ticket 57852
Expand Down Expand Up @@ -1957,6 +1968,150 @@ public function data_next_tag_ignores_contents_of_rawtext_tags() {
);
}

/**
 * Ensures that no class names are visited for a tag without a `class` attribute.
 *
 * @ticket 59209
 *
 * @covers WP_HTML_Tag_Processor::class_list
 */
public function test_class_list_empty_when_missing_class() {
	$processor = new WP_HTML_Tag_Processor( '<div>' );
	$processor->next_tag();

	$visited_count = 0;
	foreach ( $processor->class_list() as $unused_class_name ) {
		++$visited_count;
	}

	$this->assertFalse( $visited_count > 0, 'Found classes when none exist.' );
}

/**
 * Ensures that no class names are visited when `class` is a boolean
 * attribute, i.e. present without any value.
 *
 * @ticket 59209
 *
 * @covers WP_HTML_Tag_Processor::class_list
 */
public function test_class_list_empty_when_class_is_boolean() {
	$processor = new WP_HTML_Tag_Processor( '<div class>' );
	$processor->next_tag();

	$visited_count = 0;
	foreach ( $processor->class_list() as $unused_class_name ) {
		++$visited_count;
	}

	$this->assertFalse( $visited_count > 0, 'Found classes when none exist.' );
}

/**
 * Ensures that no class names are visited when the `class` attribute
 * holds an empty string.
 *
 * @ticket 59209
 *
 * @covers WP_HTML_Tag_Processor::class_list
 */
public function test_class_list_empty_when_class_is_empty() {
	$processor = new WP_HTML_Tag_Processor( '<div class="">' );
	$processor->next_tag();

	$visited_count = 0;
	foreach ( $processor->class_list() as $unused_class_name ) {
		++$visited_count;
	}

	$this->assertFalse( $visited_count > 0, 'Found classes when none exist.' );
}

/**
 * Ensures that class names are visited in the order in which they appear
 * in the `class` attribute.
 *
 * @ticket 59209
 *
 * @covers WP_HTML_Tag_Processor::class_list
 */
public function test_class_list_visits_each_class_in_order() {
	$processor = new WP_HTML_Tag_Processor( '<div class="one two three">' );
	$processor->next_tag();

	// Collect the yielded names into a zero-indexed list.
	$visited_names = iterator_to_array( $processor->class_list(), false );

	$this->assertSame( array( 'one', 'two', 'three' ), $visited_names, 'Failed to visit the class names in their original order.' );
}

/**
 * Ensures that class names are reported with their character references
 * decoded, covering named and hexadecimal numeric references.
 *
 * @ticket 59209
 *
 * @covers WP_HTML_Tag_Processor::class_list
 */
public function test_class_list_decodes_class_names() {
	$processor = new WP_HTML_Tag_Processor( '<div class="&notin;-class &lt;egg&gt; &#xff03;">' );
	$processor->next_tag();

	// Collect the yielded names into a zero-indexed list.
	$visited_names  = iterator_to_array( $processor->class_list(), false );
	$expected_names = array( '∉-class', '<egg>', "\u{ff03}" );

	$this->assertSame( $expected_names, $visited_names, 'Failed to report class names in their decoded form.' );
}

/**
 * Ensures that a class name repeated in the attribute, including a copy
 * written with a character reference, is visited only once.
 *
 * @ticket 59209
 *
 * @covers WP_HTML_Tag_Processor::class_list
 */
public function test_class_list_visits_unique_class_names_only_once() {
	$processor = new WP_HTML_Tag_Processor( '<div class="one one &#x6f;ne">' );
	$processor->next_tag();

	// Collect the yielded names into a zero-indexed list.
	$visited_names = iterator_to_array( $processor->class_list(), false );

	$this->assertSame( array( 'one' ), $visited_names, 'Visited multiple copies of the same class name when it should have skipped the duplicates.' );
}

/**
 * Ensures that `has_class()` reports the expected result for a range of
 * `class` attribute forms and sought class names.
 *
 * @ticket 59209
 *
 * @covers WP_HTML_Tag_Processor::has_class
 *
 * @dataProvider data_html_with_variations_of_class_values_and_sought_class_names
 *
 * @param string $html         Contains a tag optionally containing a `class` attribute.
 * @param string $sought_class Name of class to find in the input tag's `class`.
 * @param bool   $has_class    Whether the sought class exists in the given HTML.
 */
public function test_has_class_handles_expected_class_name_variations( $html, $sought_class, $has_class ) {
	$processor = new WP_HTML_Tag_Processor( $html );
	$processor->next_tag();

	// Handle the "class is absent" expectation first, then fall through to the positive case.
	if ( ! $has_class ) {
		$this->assertFalse( $processor->has_class( $sought_class ), "Found class {$sought_class} when it doesn't exist." );
		return;
	}

	$this->assertTrue( $processor->has_class( $sought_class ), "Failed to find expected class {$sought_class}." );
}

/**
 * Data provider.
 *
 * Each data set holds, in order: the HTML containing the tag to inspect,
 * the class name to look for, and whether that class is expected to be
 * found on the tag.
 *
 * @return array[]
 */
public function data_html_with_variations_of_class_values_and_sought_class_names() {
	return array(
		'Tag without any classes'      => array( '<div>', 'foo', false ),
		'Tag with boolean class'       => array( '<img class>', 'foo', false ),
		'Tag with empty class'         => array( '<p class="">', 'foo', false ),
		'Tag with exact match'         => array( '<button class="foo">', 'foo', true ),
		'Tag with duplicate matches'   => array( '<span class="foo bar foo">', 'foo', true ),
		'Tag with non-initial match'   => array( '<section class="bar foo">', 'foo', true ),
		// `&hellip;` decodes to U+2026 (horizontal ellipsis); written as an escape so the sought value survives re-encoding.
		'Tag with encoded match'       => array( '<main class="&hellip;">', "\u{2026}", true ),
		'Class with tab separator'     => array( "<div class='one\ttwo'>", 'two', true ),
		'Class with newline separator' => array( "<div class='one\ntwo\n'>", 'two', true ),
		'False duplicate attribute'    => array( '<img class=dog class=cat>', 'cat', false ),
	);
}

/**
* Ensures that the invalid comment closing syntax "--!>" properly closes a comment.
*
Expand Down

0 comments on commit cecc810

Please sign in to comment.