Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Play: Add various HTML API utilities and debuggers. #6832

Draft
wants to merge 10 commits into
base: trunk
Choose a base branch
from
176 changes: 176 additions & 0 deletions highlight-html.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
<?php

require_once __DIR__ . '/src/wp-load.php';

/*
 * ANSI SGR escape sequences used to paint each syntax category.
 * "\e[m" resets the terminal rendition; the others select a 24-bit
 * foreground color in the form "\e[38;2;<red>;<green>;<blue>m".
 */
const C_TEXT    = "\e[m";                  // Plain text: terminal default.
const C_REF     = "\e[38;2;154;110;58m";   // Character references inside text.
const C_COMMENT = "\e[38;2;112;128;144m";  // Comments and raw element contents.
const C_SYNTAX  = "\e[38;2;153;153;153m";  // Punctuation such as < > = and quotes.
const C_TAGNAME = "\e[38;2;153;0;85m";     // Tag names.
const C_ANAME   = "\e[38;2;102;153;0m";    // Attribute names.
const C_AVALUE  = "\e[38;2;0;119;170m";    // Attribute values.

/*
 * Command-line options:
 *   -f        Enable formatting (extra newlines and indentation) in the tag printer.
 *   -u <uri>  Read the document from this URI instead of stdin. "https://" is
 *             prepended when the value has no http(s) scheme.
 */
$options   = getopt( 'fu:' );
$do_format = isset( $options['f'] );

$uri = 'php://stdin';
if ( isset( $options['u'] ) ) {
	// getopt() yields an array when an option is repeated; the last one wins.
	$uri = is_array( $options['u'] ) ? end( $options['u'] ) : $options['u'];
	if ( ! preg_match( '~^https?://~', $uri ) ) {
		$uri = "https://{$uri}";
	}
}

$html = file_get_contents( $uri );
if ( false === $html ) {
	// Without this guard a failed read would hand `false` to the parser.
	fwrite( STDERR, "Unable to read input from {$uri}\n" );
	exit( 1 );
}

/**
 * HTML Processor subclass that exposes the raw source bytes of the current token.
 *
 * Declared as a named class so the static factory can be called directly,
 * without first constructing a throwaway instance to reach it.
 */
class Highlight_HTML_Processor extends WP_HTML_Processor {
	/**
	 * Returns the unmodified source span of the currently-matched token.
	 *
	 * @return string Raw token bytes, or an empty string when no bookmark could be set.
	 */
	public function get_raw_token() {
		if ( ! $this->set_bookmark( 'here' ) ) {
			return '';
		}

		// The bookmark is read back under the "_"-prefixed key, as in the original lookup.
		if ( ! isset( $this->bookmarks['_here'] ) ) {
			return '';
		}

		$here = $this->bookmarks['_here'];
		return substr( $this->html, $here->start, $here->length );
	}
}

$p = Highlight_HTML_Processor::create_full_parser( $html );
if ( null === $p ) {
	// Guard against the factory declining to build a parser.
	fwrite( STDERR, "Unable to create an HTML parser for the given input.\n" );
	exit( 1 );
}

// Walk every token in document order and print it with syntax coloring.
while ( $p->next_token() ) {
	$token_type = $p->get_token_type();

	switch ( $token_type ) {
		case '#comment':
			echo C_COMMENT . '<!--' . $p->get_modifiable_text() . '-->';
			break;

		case '#doctype':
			echo C_SYNTAX . '<!DOCTYPE' . $p->get_modifiable_text() . '>';
			break;

		case '#tag':
			print_tag( $p );
			break;

		case '#text':
			print_text( $p );
			break;

		default:
			// Reset colors first so the shell is not left tinted, then fail loudly.
			echo "\e[m\n";
			fwrite( STDERR, "Unsupported syntax: {$token_type}\n" );
			exit( 1 );
	}
}

echo "\e[m\n";

// A parser that gave up part-way would otherwise look like a short document.
$last_error = $p->get_last_error();
if ( null !== $last_error ) {
	fwrite( STDERR, "Parsing stopped early: {$last_error}\n" );
}

/**
 * Prints the current text token, highlighting character references.
 *
 * The raw source of the token is scanned for "&" and every span that
 * WP_HTML_Decoder recognizes as a character reference is wrapped in C_REF.
 *
 * @param object $p Processor positioned on a text token; must provide
 *                  get_token_name(), get_modifiable_text() and get_raw_token().
 */
function print_text( $p ) {
	$token_name = $p->get_token_name();
	if ( in_array( $token_name, [ 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ], true ) ) {
		// Previously this string was returned, and the caller discarded it.
		echo C_TEXT . $p->get_modifiable_text();
		return;
	}

	$raw_token  = $p->get_raw_token();
	$raw_length = strlen( $raw_token );
	$text       = '';
	$at         = 0;
	$was_at     = 0;

	while ( $at < $raw_length ) {
		$next_at = strpos( $raw_token, '&', $at );
		if ( false === $next_at ) {
			break;
		}

		$skip_bytes  = 0;
		$replacement = WP_HTML_Decoder::read_character_reference( 'data', $raw_token, $next_at, $skip_bytes );
		if ( isset( $replacement ) ) {
			$text  .= substr( $raw_token, $was_at, $next_at - $was_at ) . C_REF . substr( $raw_token, $next_at, $skip_bytes ) . C_TEXT;
			$at     = $next_at + $skip_bytes;
			$was_at = $at;
			continue;
		}

		// Not a reference: resume right after this "&" instead of re-finding it byte by byte.
		$at = $next_at + 1;
	}

	if ( $was_at < $raw_length ) {
		$text .= substr( $raw_token, $was_at );
	}

	// A single reset is enough; consecutive resets render identically.
	echo C_TEXT . $text;
}

/**
 * Prints the current tag token with syntax coloring.
 *
 * Emits the tag name, its attributes and, when the token carries
 * modifiable text, that text followed by a synthesized closing tag.
 * With the global `$do_format` set, selected tags are preceded by a
 * newline and indentation tracked in a static depth counter.
 *
 * @param object $p Processor positioned on a tag token.
 */
function print_tag( $p ) {
	global $do_format;

	static $depth = 0;

	$tag_name   = $p->get_tag();
	$lower_name = strtolower( $tag_name );
	$is_closer  = $p->is_tag_closer();
	$closer     = $is_closer ? '/' : '';

	// Tags whose opener/closer change the indentation depth.
	$nesting_tags = [ 'HEAD', 'BODY', 'OL', 'UL', 'DIV' ];

	if ( $is_closer && in_array( $tag_name, $nesting_tags, true ) ) {
		// Clamp at zero so a stray closer cannot skew every following indent.
		$depth = max( 0, $depth - 1 );
	}

	$indent = str_repeat( ' ', $depth * 2 );

	$breaks_before_opener = [
		'DIV', 'P', 'UL', 'OL', 'DETAILS', 'SVG', 'PATH', 'G',
		'LINK', 'META', 'HTML', 'HEAD', 'BODY', 'TITLE', 'TEXTAREA',
		'PRE', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'HGROUP',
		'PICTURE', 'SOURCE', 'FIGURE', 'FORM', 'TABLE', 'TR',
		'FIGCAPTION', 'BLOCKQUOTE', 'OBJECT', 'EMBED', 'IFRAME',
		'SCRIPT', 'STYLE', 'NOSCRIPT', 'NAV', 'LI',
	];
	$breaks_before_closer = [ 'HEAD', 'HTML', 'BODY', 'PICTURE', 'FIGURE', 'TABLE' ];

	$break_list = $is_closer ? $breaks_before_closer : $breaks_before_opener;
	if ( $do_format && in_array( $tag_name, $break_list, true ) ) {
		echo "\n{$indent}";
	}

	echo C_SYNTAX . '<' . $closer;
	echo C_TAGNAME . $lower_name;

	$attributes = $p->get_attribute_names_with_prefix( '' ) ?? [];
	foreach ( $attributes as $name ) {
		$value = $p->get_attribute( $name );

		echo ' ' . C_ANAME . $name;
		// `true` marks a boolean attribute; anything else non-string has no value to print.
		if ( ! is_string( $value ) ) {
			continue;
		}

		echo C_SYNTAX . '="';
		// Escape "&" before '"' so a value such as `&lt;` is printed as `&amp;lt;`
		// rather than as text that would re-parse as a character reference.
		echo C_AVALUE . str_replace( [ '&', '"' ], [ '&amp;', '&quot;' ], $value );
		echo C_SYNTAX . '"';
	}
	echo C_SYNTAX . '>';

	$closing_tag = C_SYNTAX . '</' . C_TAGNAME . $lower_name . C_SYNTAX . '>';

	$text = $p->get_modifiable_text();
	// Compare against '' explicitly: empty() would also discard the content "0".
	if ( is_string( $text ) && '' !== $text ) {
		echo 'TITLE' === $tag_name ? C_TEXT : C_COMMENT;

		$add_newlines = (
			$do_format &&
			'' !== trim( $text ) &&
			in_array( $tag_name, [ 'SCRIPT', 'STYLE', 'TEXTAREA', 'PRE' ], true )
		);

		if ( $add_newlines ) {
			echo "\n" . trim( $text, "\n" ) . "\n";
		} else {
			echo $text;
		}

		echo $closing_tag;
	} elseif ( ! $is_closer && in_array( $tag_name, [ 'SCRIPT', 'STYLE', 'TEXTAREA' ], true ) ) {
		/*
		 * Empty elements of these kinds still need their closer synthesized.
		 * Closer tokens are excluded so they are never printed twice.
		 * NOTE(review): PRE was removed from this list because its content and
		 * closing tag appear to arrive as separate tokens — confirm.
		 */
		echo $closing_tag;
	}

	if ( ! $is_closer && in_array( $tag_name, $nesting_tags, true ) ) {
		$depth++;
	}
}
187 changes: 187 additions & 0 deletions html-grep.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
<?php

require_once( __DIR__ . '/src/wp-load.php' );

/**
 * Entry point: parses command-line options and runs the grep.
 *
 * Options:
 *   -p <pattern>  PCRE pattern including delimiters (required).
 *   -i <file>     Input file; alternatively pass a bare "-" to read stdin.
 *   -B <n>        Lines of context before a match (default 0).
 *   -A <n>        Lines of context after a match (default 0).
 *   -m <n>        Stop after this many matches (default 1).
 */
function main() {
	global $argv;

	$opts = getopt( 'A:B:p:i:m:', [] );

	// Report usage errors on stderr with a non-zero status so callers can detect them.
	$fail = static function ( $message ) {
		fwrite( STDERR, $message . "\n" );
		exit( 1 );
	};

	// Reads a non-negative integer option; repeated (array) or malformed values fall back.
	$count = static function ( $value, $default ) {
		return ( is_string( $value ) && ctype_digit( $value ) ) ? (int) $value : $default;
	};

	if ( ! isset( $opts['p'] ) || ! is_string( $opts['p'] ) ) {
		$fail( 'Please supply a search pattern with -p, e.g. `-p "/[a-f0-9]+/"`' );
	}

	/*
	 * The pattern is handed to preg_match() unchanged, so it must carry its own
	 * delimiters. Compile it once here; the warning is suppressed because the
	 * failure is reported explicitly below.
	 */
	if ( false === @preg_match( $opts['p'], '' ) ) {
		$fail( 'Invalid pattern: supply a PCRE pattern with delimiters, e.g. `-p "/[a-f0-9]+/"`' );
	}

	$use_stdin = in_array( '-', $argv, true );
	if ( ! $use_stdin && ( ! isset( $opts['i'] ) || ! is_string( $opts['i'] ) ) ) {
		$fail( 'Please specify input filename with -i or use stdin with -, e.g. `-i file.html`' );
	}

	$lines_before = $count( $opts['B'] ?? '', 0 );
	$lines_after  = $count( $opts['A'] ?? '', 0 );

	$max = $count( $opts['m'] ?? '', 1 );
	if ( $max < 1 ) {
		$max = 1;
	}

	$input = $use_stdin ? 'php://stdin' : $opts['i'];
	Grepper::scan( $input, $opts['p'], $lines_before, $lines_after, $max );
}

/**
 * Tag Processor that can be fed input incrementally and that remembers
 * the span of the most recently visited token.
 */
class Debugger extends WP_HTML_Tag_Processor {
	/**
	 * Returns the full HTML buffer accumulated so far.
	 */
	public function h() {
		return $this->html;
	}

	/**
	 * Appends more input and, if the parser had stopped at the end of the
	 * buffer (complete or incomplete), marks it ready to continue.
	 *
	 * @param string $line Text to append to the buffer.
	 */
	public function extend( $line ) {
		$this->html = $this->html . $line;

		$stopped_states = array( self::STATE_COMPLETE, self::STATE_INCOMPLETE_INPUT );
		if ( in_array( $this->parser_state, $stopped_states, true ) ) {
			$this->parser_state = self::STATE_READY;
		}
	}

	/**
	 * Advances to the next token and bookmarks it as "here".
	 *
	 * @return mixed Whatever the parent implementation returned.
	 */
	public function next_token() {
		$found = parent::next_token();

		// Bookmark regardless of the result, exactly as callers of at() expect.
		$this->set_bookmark( 'here' );

		return $found;
	}

	/**
	 * Returns the bookmark recorded by the latest next_token() call.
	 */
	public function at() {
		return $this->bookmarks['here'];
	}
}

/**
 * Searches the text content of an HTML stream for a pattern and prints
 * each match together with the surrounding markup.
 */
class Grepper {
	/**
	 * Scans the input line by line, feeding it to a Debugger processor and
	 * matching the pattern against the accumulated text-node content.
	 *
	 * @param string $input   Path or stream URI to read.
	 * @param string $pattern PCRE pattern, delimiters included.
	 * @param int    $before  Lines of markup context to show before the matching token.
	 * @param int    $after   Lines of markup context to show after the matching token.
	 * @param int    $max     Number of matches after which scanning stops.
	 */
	public static function scan( $input, $pattern, $before, $after, $max ) {
		$f = fopen( $input, 'r' );
		if ( false === $f ) {
			fwrite( STDERR, "Unable to open input: {$input}\n" );
			return;
		}

		$match_count = 0;
		$line_number = 0;
		$collapse    = static function ( $s ) {
			return preg_replace( '~[ \r\f\t\n]+~', ' ', $s );
		};
		$pre_depth   = 0;
		$p           = new Debugger( '' );
		$t           = '';

		while ( false !== ( $line = fgets( $f ) ) ) {
			$line_number++;

			$p->extend( $line );
			while ( $p->next_token() ) {
				$type = $p->get_token_type();
				if ( '#tag' !== $type && '#text' !== $type ) {
					continue;
				}

				$at = $p->at();

				switch ( $p->get_token_name() ) {
					case 'PRE':
						// Never drop below zero on an unmatched closer.
						$pre_depth = max( 0, $pre_depth + ( $p->is_tag_closer() ? -1 : 1 ) );
						break;

					case '#text':
						/*
						 * The modifiable text is used as returned. It was previously
						 * passed through the decoder a second time, which turned
						 * `&amp;lt;` into `<`.
						 */
						$node_text = $p->get_modifiable_text();
						$t        .= $pre_depth > 0 ? $node_text : $collapse( $node_text );
						break;
				}

				if ( preg_match( $pattern, $t, $match, PREG_OFFSET_CAPTURE ) ) {
					// Capture the line number before reading ahead for context.
					$match_line = $line_number;
					$match_end  = $match[0][1] + strlen( $match[0][0] );

					$h = (
						"\e[32m" .
						ltrim( mb_strcut( $t, 0, $match[0][1] ) ) .
						"\e[33m" .
						$match[0][0] .
						"\e[32m" .
						rtrim( mb_strcut( $t, $match_end ) ) .
						"\e[90m"
					);

					// Pull in the trailing context lines, keeping the line count accurate.
					for ( $i = 0; $i < $after; $i++ ) {
						$extra = fgets( $f );
						if ( false === $extra ) {
							break;
						}
						$line_number++;
						$p->extend( $extra );
					}

					$html = $p->h();
					$cb   = self::last_lines( mb_strcut( $html, 0, $at->start ), $before );
					$cc   = mb_strcut( $html, $at->start, $at->length );
					$ca   = self::first_lines( mb_strcut( $html, $at->start + $at->length ), $after );

					// If the match lies entirely within the current node, highlight it there.
					$tt = $p->get_modifiable_text();
					if ( preg_match( $pattern, $tt, $mm, PREG_OFFSET_CAPTURE ) ) {
						$cc = (
							"\e[90m" .
							mb_strcut( $tt, 0, $mm[0][1] ) .
							"\e[33m" .
							$mm[0][0] .
							"\e[90m" .
							mb_strcut( $tt, $mm[0][1] + strlen( $mm[0][0] ) )
						);
					}

					echo "\n\e[32m{$match_line}\e[90m: \e[31m{$p->get_token_name()} \e[90m{$h}\e[m\n";
					echo "\e[90m{$cb}\e[33m{$cc}\e[90m{$ca}\e[m";

					if ( ++$match_count >= $max ) {
						fclose( $f );
						return;
					}

					$t = '';
				}

				// Keep only a short tail so matches can still span adjacent nodes.
				$t = mb_strcut( $t, -100 );
			}
		}

		fclose( $f );
	}

	/**
	 * Prefixes every line of a string with a single space.
	 *
	 * @param string $lines Newline-separated text.
	 * @return string The text with each line prefixed.
	 */
	public static function indent( $lines ) {
		return implode( "\n", array_map(
			static function ( $line ) { return ' ' . $line; },
			explode( "\n", $lines )
		) );
	}

	/**
	 * Keeps at most the last `$count` lines of `$text`, capped at 80 bytes per line.
	 *
	 * A non-positive count yields an empty string. It must be handled separately
	 * because a slice offset of -0 is 0 and would keep the whole text.
	 */
	private static function last_lines( $text, $count ) {
		if ( $count <= 0 ) {
			return '';
		}

		$kept = array_slice( explode( "\n", $text ), -$count );
		return mb_strcut( implode( "\n", $kept ), -$count * 80 );
	}

	/**
	 * Keeps at most the first `$count` lines of `$text`, capped at 80 bytes per line.
	 */
	private static function first_lines( $text, $count ) {
		if ( $count <= 0 ) {
			return '';
		}

		$kept = array_slice( explode( "\n", $text ), 0, $count );
		return mb_strcut( implode( "\n", $kept ), 0, $count * 80 );
	}
}

// Run the CLI entry point defined above.
main();

/**
 * Reports whether a tag name is in the fixed list of line-breaking elements.
 *
 * Names are compared as given; only upper-case names appear in the list.
 *
 * @param string $tag_name Tag name to test.
 * @return bool True when the name is in the list, false otherwise.
 */
function is_line_breaker( $tag_name ) {
	static $line_breakers = array(
		'BLOCKQUOTE',
		'BR',
		'DD',
		'DIV',
		'DL',
		'DT',
		'H1',
		'H2',
		'H3',
		'H4',
		'H5',
		'H6',
		'HR',
		'LI',
		'OL',
		'P',
		'UL',
	);

	// Loose comparison on purpose: it mirrors how a switch statement compares its cases.
	return in_array( $tag_name, $line_breakers );
}
Loading
Loading