diff --git a/highlight-html.php b/highlight-html.php new file mode 100644 index 0000000000000..1ddb46b621f30 --- /dev/null +++ b/highlight-html.php @@ -0,0 +1,176 @@ +set_bookmark('here'); + $here = $this->bookmarks['_here']; + return substr( $this->html, $here->start, $here->length ); + } +}; + +$p = $p::create_Full_parser( $html ); + +while ( $p->next_token() ) { + switch ( $p->get_token_type() ) { + case '#comment': + echo C_COMMENT . ''; + break; + + case '#doctype': + echo C_SYNTAX . 'get_modifiable_text() . '>'; + break; + + case '#tag': + print_tag( $p ); + break; + + case '#text': + print_text( $p ); + break; + + default: + die( "Unsupported syntax: {$p->get_token_type()}" ); + } +} + +echo "\e[m\n"; + +function print_text( $p ) { + $token_name = $p->get_token_name(); + if ( in_array( $token_name, [ 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ], true ) ) { + return C_TEXT . $p->get_modifiable_text(); + } + + $raw_token = $p->get_raw_token(); + $text = C_TEXT; + $at = 0; + $was_at = 0; + while ( $at < strlen( $raw_token ) ) { + $next_at = strpos( $raw_token, '&', $at ); + if ( false === $next_at ) { + break; + } + + $replacement = WP_HTML_Decoder::read_character_reference( 'data', $raw_token, $next_at, $skip_bytes ); + if ( isset( $replacement ) ) { + $text .= substr( $raw_token, $was_at, $next_at - $was_at ) . C_REF . substr( $raw_token, $next_at, $skip_bytes ) . C_TEXT; + $at = $next_at + $skip_bytes; + $was_at = $at; + continue; + } + + ++$at; + } + if ( $was_at < strlen( $raw_token ) ) { + $text .= substr( $raw_token, $was_at ); + } + echo C_TEXT . $text; +} + +function print_tag( $p ) { + global $do_format; + + static $depth = 0; + + $tag_name = $p->get_tag(); + $is_closer = $p->is_tag_closer(); + $closer = $is_closer ? '/' : ''; + $is_void = WP_HTML_Processor::is_void( $tag_name ); + $voider = $is_void ? '/' : ''; + + if ( $is_closer && in_array( $tag_name, [ 'HEAD', 'BODY', 'OL', 'UL', 'DIV' ], true ) ) { + $depth--; + } + + $indent = str_pad( '', $depth * 2, ' ' ); + + if ( $do_format && ( + ( + ! $is_closer && in_array( $tag_name, [ + 'DIV', 'P', 'UL', 'OL', 'DETAILS', 'SVG', 'PATH', 'G', + 'LINK', 'META', 'HTML', 'HEAD', 'BODY', 'TITLE', 'TEXTAREA', + 'PRE', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'HGROUP', + 'PICTURE', 'SOURCE', 'FIGURE', 'FORM', 'TABLE', 'TR', + 'FIGCAPTION', 'BLOCKQUOTE', 'OBJECT', 'EMBED', 'IFRAME', + 'SCRIPT', 'STYLE', 'NOSCRIPT', 'NAV', 'LI' + ], true ) + ) || ( + $is_closer && in_array( $tag_name, [ + 'HEAD', 'HTML', 'BODY', 'PICTURE', 'FIGURE', 'TABLE' + ], true ) + ) + ) ) { + echo "\n{$indent}"; + } + echo C_SYNTAX . '<' . $closer; + + echo C_TAGNAME . strtolower( $p->get_tag() ); + $attributes = $p->get_attribute_names_with_prefix( '' ) ?? array(); + + foreach( $attributes as $name ) { + $value = $p->get_attribute( $name ); + + echo ' ' . C_ANAME . $name; + if ( true === $value ) { + continue; + } + + echo C_SYNTAX . '="'; + echo C_AVALUE . str_replace( '"', '"', $value ); + echo C_SYNTAX . '"'; + } + echo C_SYNTAX . '>'; + + $text = $p->get_modifiable_text(); + if ( ! empty( $text ) ) { + echo 'TITLE' === $p->get_tag() ? C_TEXT : C_COMMENT; + + $add_newlines = ( + $do_format && + strlen( trim( $text ) ) > 0 && + ( + 'SCRIPT' === $tag_name || + 'STYLE' === $tag_name || + 'TEXTAREA' === $tag_name || + 'PRE' === $tag_name + ) + ); + + if ( $add_newlines ) { + echo "\n" . trim( $text, "\n" ) . "\n"; + } else { + echo $text; + } + + echo C_SYNTAX . '' . C_TAGNAME . strtolower( $p->get_tag() ) . C_SYNTAX . '>'; + } elseif ( in_array( $tag_name, [ 'SCRIPT', 'STYLE', 'TEXTAREA', 'PRE' ], true ) ) { + echo C_SYNTAX . '' . C_TAGNAME . strtolower( $p->get_tag() ) . C_SYNTAX . '>'; + } + + if ( ! $is_closer && in_array( $tag_name, [ 'HEAD', 'BODY', 'OL', 'UL', 'DIV' ], true ) ) { + $depth++; + } +} diff --git a/html-grep.php b/html-grep.php new file mode 100644 index 0000000000000..07741d6f5624e --- /dev/null +++ b/html-grep.php @@ -0,0 +1,187 @@ + 0 ) + ? (int) $opts['m'] + : 1; + + $input = in_array( '-', $argv, true ) ? 'php://stdin' : $opts['i']; + Grepper::scan( $input, $opts['p'], $lines_before, $lines_after, $max ); +} + +class Debugger extends WP_HTML_Tag_Processor { + public function h() { + return $this->html; + } + + public function extend( $line ) { + $this->html .= $line; + + if ( + $this->parser_state === self::STATE_COMPLETE || + $this->parser_state === self::STATE_INCOMPLETE_INPUT + ) { + $this->parser_state = self::STATE_READY; + } + } + + public function next_token() { + $r = parent::next_token(); + $this->set_bookmark( 'here' ); + return $r; + } + + public function at() { + return $this->bookmarks['here']; + } +} + +class Grepper { + public static function scan( $input, $pattern, $before, $after, $max ) { + $f = fopen( $input, 'r' ); + $c = 0; + $n = 0; + $lines = []; + $lc = 1 + $before + $after; + $o = static function ( $s ) { return WP_HTML_Decoder::decode_text_node( $s ); }; + $ws = static function ( $s ) { return preg_replace( '~[ \r\f\t\n]+~', ' ', $s ); }; + $pre_depth = 0; + $p = new Debugger( '' ); + $t = ''; + + while ( false !== ( $line = fgets( $f ) ) ) { + $n++; + + $p->extend( $line ); + while ( $p->next_token() ) { + $at = $p->at(); + $type = $p->get_token_type(); + $node_text = $o( $p->get_modifiable_text() ); + $node_text = $pre_depth > 0 ? $node_text : $ws( $node_text ); + + if ( '#tag' !== $type && '#text' !== $type ) { + continue; + } + + switch ( $p->get_token_name() ) { + case 'PRE': + $pre_depth += $p->is_tag_closer() ? -1 : 1; + break; + + case '#text': + $t .= $node_text; + } + + if ( preg_match( $pattern, $t, $match, PREG_OFFSET_CAPTURE ) ) { + $h = ( + "\e[32m" . + ltrim( mb_strcut( $t, 0, $match[0][1] ) ) . + "\e[33m" . + $match[0][0] . + "\e[32m" . + rtrim( mb_strcut( $t, $match[0][1] + strlen( $match[0][0] ) ) ) . + "\e[90m" + ); + + for ( $i = 0; $i < $after; $i++ ) { + $line = fgets( $f ); + if ( false !== $line ) { + $p->extend( $line ); + } + } + + $cb = mb_strcut( $p->h(), 0, $at->start ); + $cc = mb_strcut( $p->h(), $at->start, $at->length ); + $ca = mb_strcut( $p->h(), $at->start + $at->length ); + + // Limit context to N lines preview + $cb = explode( "\n", $cb ); + $cb = array_slice( $cb, -$before ); + $cb = mb_strcut( implode( "\n", $cb ), -$before * 80 ); + + // Limit context to N lines preview + $ca = explode( "\n", $ca ); + $ca = array_slice( $ca, 0, $after ); + $ca = mb_strcut( implode( "\n", $ca ), 0, $after * 80 ); + + // If contained in last node. + $tt = $p->get_modifiable_text(); + if ( preg_match( $pattern, $tt, $mm, PREG_OFFSET_CAPTURE ) ) { + $cc = ( + "\e[90m" . + mb_strcut( $tt, 0, $mm[0][1] ) . + "\e[33m" . + $mm[0][0] . + "\e[90m" . + mb_strcut( $tt, $mm[0][1] + strlen( $mm[0][0] ) ) + ); + } + + echo "\n\e[32m{$n}\e[90m: \e[31m{$p->get_token_name()} \e[90m{$h}\e[m\n"; + echo "\e[90m{$cb}\e[33m{$cc}\e[90m{$ca}\e[m"; + + if ( ++$c >= $max ) { + fclose( $f ); + exit; + } + + $t = ''; + } + + $t = mb_strcut( $t, -100 ); + } + + } + } + + public static function indent( $lines ) { + return implode( "\n", array_map( + static function ( $line ) { return ' ' . $line; }, + explode( "\n", $lines ) + ) ); + } +} + +main(); + +function is_line_breaker( $tag_name ) { + switch ( $tag_name ) { + case 'BLOCKQUOTE': + case 'BR': + case 'DD': + case 'DIV': + case 'DL': + case 'DT': + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + case 'HR': + case 'LI': + case 'OL': + case 'P': + case 'UL': + return true; + } + + return false; +} diff --git a/html-to-text.php b/html-to-text.php new file mode 100644 index 0000000000000..93a2dc83355ac --- /dev/null +++ b/html-to-text.php @@ -0,0 +1,374 @@ +next_token() ) { + $node_name = $p->get_token_name(); + + if ( in_array( strtolower( $node_name ), $skip_first_nodes, true ) ) { + $depth = $p->get_current_depth(); + while ( $p->get_current_depth() >= $depth ) { + $p->next_token(); + } + array_shift( $skip_first_nodes ); + continue; + } + + if ( in_array( strtolower( $node_name ), $skip_nodes, true ) ) { + $depth = $p->get_current_depth(); + while ( $p->get_current_depth() >= $depth ) { + $p->next_token(); + } + continue; + } + + $node_text = WP_HTML_Decoder::decode_text_node( $p->get_modifiable_text() ); + $tag_name = '#tag' === $p->get_token_type() + ? ( ( $p->is_tag_closer() ? '-' : '+' ) . $node_name ) + : $node_name; + + if ( '#tag' === $p->get_token_type() && ! $p->is_tag_closer() && is_line_breaker( $node_name ) ) { + $needs_newline = ! $prev_was_li; + } + + if ( $ansi ) { + if ( + '+MAIN' === $tag_name || + 'main' === $p->get_attribute( 'role' ) || + 'main-content' === $p->get_attribute( 'id' ) || // cloudflare. + 'hnmain' === $p->get_attribute( 'id' ) // Hackernews. + ) { + $text_content .= "\e]1337;SetMark\x07"; + } + + switch ( $tag_name ) { + case '+A': + $href = $p->get_attribute( 'href' ); + if ( is_string( $href ) && preg_match( '~^https?://~', $href ) ) { + // External link, probably. + $text_content .= "\e[32m\e]8;;{$href}\x07"; + } elseif ( str_starts_with( $href, 'javascript:' ) ) { + break; + } else { + // Internal link, probably. + $text_content .= "\e[90m\e]8;;{$base_url}{$href}\x07"; + } + break; + + case '-A': + $text_content .= "\e]8;;\x07\e[m"; + break; + + case '+B': + case '+STRONG': + $text_content .= "\e[2m"; + break; + + case '-B': + case '-STRONG': + $text_content .= "\e[22m"; + break; + + case '+C-': + $rgb = color_for_syntax_element( $p ); + if ( null !== $rgb ) { + $text_content .= "\e[38;2;{$rgb[0]};{$rgb[1]};{$rgb[2]}m"; + } + break; + + case '-C-': + $text_content .= "\e[m"; + break; + + case '+H1': + case '+H2': + case '+H3': + case '+H4': + case '+H5': + case '+H6': + $text_content .= "\e[1m"; + break; + + case '-H1': + case '-H2': + case '-H3': + case '-H4': + case '-H5': + case '-H6': + $text_content .= "\e[22m"; + break; + + case '+I': + case '+EM': + $text_content .= "\e[3m"; + break; + + case '-I': + case '-EM': + $text_content .= "\e[23m"; + break; + + case '+SUB': + $text_content .= "\e[74m"; + break; + + case '+SUP': + $text_content .= "\e[73m"; + break; + + case '-SUB': + case '-SUP': + $text_content .= "\e[75m"; + break; + + case '+TITLE': + $text_content .= "\e]0;{$node_text}\x07"; + break; + } + } + + switch ( $tag_name ) { + case '+LI': + $text_content .= "\n \e[31m•\e[39m "; + $needs_newline = false; + break; + + case '+H1': + case '+H2': + case '+H3': + case '+H4': + case '+H5': + case '+H6': + $text_content .= "\n\n" . str_pad( '', intval( $node_name[1] ), '#' ) . ' '; + $needs_newline = false; + break; + + case '+CITE': + $text_content .= ' «'; + break; + + case '-CITE': + $text_content .= '»'; + break; + + case '+CODE': + case '-CODE': + if ( $ansi && ! $p->is_tag_closer() ) { + $text_content .= "\e[90m"; + } + if ( $in_pre ) { + $text_content .= $p->is_tag_closer() ? "\n```" : "\n```\n"; + } else { + $text_content .= '`'; + } + if ( $ansi && $p->is_tag_closer() ) { + $text_content .= "\e[m"; + } + break; + + case '+DT': + $text_content .= "\n\n✏️ "; + $needs_newline = false; + break; + + case '+DD': + $text_content .= "\n 📝 "; + $needs_newline = false; + break; + + case '+IMG': + $alt = $p->get_attribute( 'alt' ); + if ( is_string( $alt ) && ! empty( $alt ) ) { + $text_content .= "[\e[31m{$alt}\e[m]"; + } + break; + + case '+PRE': + case '-PRE': + if ( $p->is_tag_closer() ) { + $in_pre = false; + $text_content .= "\e[90m```\e[m\n"; + } else { + $in_pre = true; + $text_content .= "\n\n\e[90m```"; + $lang = $p->get_attribute( 'lang' ); + if ( is_string( $lang ) ) { + $text_content .= $lang; + } + $text_content .= "\e[m\n"; + } + + break; + + case '+TABLE': + $text_content .= "\n\n"; + break; + + case '+TH': + $text_content .= "\e[1;3m"; + break; + + case '-TD': + case '-TH': + $text_content .= "\t\e[0;90m|\e[m "; + break; + + case '+TR': + $text_content .= "\e[90m| \e[m"; + break; + + case '-TR': + $text_content .= "\e[90m |\e[m\n"; + break; + + case '#text': + if ( $needs_newline ) { + $text_content .= "\n\n"; + $needs_newline = false; + } + $text_content .= $in_pre ? $node_text : preg_replace( '~[ \t\r\f\n]+~', ' ', $node_text ); + } + + $prev_was_li = '+LI' === $tag_name; +} + +echo trim( $text_content ); + +if ( null !== $p->get_last_error() ) { + echo "\n\e[31mFailed\e[90m because of '\e[2,31m{$p->get_last_error()}\e[0,90m'\e[m\n"; + $unsupported = $p->get_unsupported_exception(); + if ( isset( $unsupported ) ) { + echo "\e[90m ┤ {$unsupported->getMessage()}\e[m\n"; + } +} else if ( $p->paused_at_incomplete_token() ) { + echo trim( $text_content ); + echo "\n\e[31mIncomplete input\e[90m found at end of document; unable to proceed.\e[m\n"; +} + +function is_line_breaker( $tag_name ) { + switch ( $tag_name ) { + case 'BLOCKQUOTE': + case 'BR': + case 'DD': + case 'DIV': + case 'DL': + case 'DT': + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + case 'HR': + case 'LI': + case 'OL': + case 'P': + case 'UL': + return true; + } + + return false; +} + +function color_for_syntax_element( $processor ) { + static $colors = [ + 'a' => [0x99, 0x00, 0x55], + 'b' => [0x99, 0x00, 0x55], + 'c' => [0x70, 0x80, 0x90], + 'd' => [0x70, 0x80, 0x90], + 'e' => [0x00, 0x77, 0xaa], + 'f' => [0x66, 0x99, 0x00], + 'g' => [0x22, 0x22, 0x22], + 'k' => [0x99, 0x00, 0x55], + 'l' => [0x00, 0x00, 0x00], + 'm' => [0x00, 0x00, 0x00], + 'n' => [0x00, 0x77, 0xaa], + 'o' => [0x99, 0x99, 0x99], + 'p' => [0x99, 0x99, 0x99], + 's' => [0xa6, 0x7f, 0x59], + 't' => [0xa6, 0x7f, 0x59], + 'u' => [0xa6, 0x7f, 0x59], + 'cp' => [0x70, 0x80, 0x90], + 'c1' => [0x70, 0x80, 0x90], + 'cs' => [0x70, 0x80, 0x90], + 'kc' => [0x99, 0x00, 0x55], + 'kn' => [0x99, 0x00, 0x55], + 'kp' => [0x99, 0x00, 0x55], + 'kr' => [0x99, 0x00, 0x55], + 'ld' => [0x00, 0x00, 0x00], + 'nc' => [0x00, 0x77, 0xaa], + 'no' => [0x00, 0x77, 0xaa], + 'nd' => [0x00, 0x77, 0xaa], + 'ni' => [0x00, 0x77, 0xaa], + 'ne' => [0x00, 0x77, 0xaa], + 'nf' => [0x00, 0x77, 0xaa], + 'nl' => [0x00, 0x77, 0xaa], + 'nn' => [0x00, 0x77, 0xaa], + 'py' => [0x00, 0x77, 0xaa], + 'ow' => [0x99, 0x99, 0x99], + 'mb' => [0x00, 0x00, 0x00], + 'mf' => [0x00, 0x00, 0x00], + 'mh' => [0x00, 0x00, 0x00], + 'mi' => [0x00, 0x00, 0x00], + 'mo' => [0x00, 0x00, 0x00], + 'sb' => [0xa6, 0x7f, 0x59], + 'sc' => [0xa6, 0x7f, 0x59], + 'sd' => [0xa6, 0x7f, 0x59], + 'se' => [0xa6, 0x7f, 0x59], + 'sh' => [0xa6, 0x7f, 0x59], + 'si' => [0xa6, 0x7f, 0x59], + 'sx' => [0xa6, 0x7f, 0x59], + 'sr' => [0xa6, 0x7f, 0x59], + 'ss' => [0xa6, 0x7f, 0x59], + 'vc' => [0x00, 0x77, 0xaa], + 'vg' => [0x00, 0x77, 0xaa], + 'vi' => [0x00, 0x77, 0xaa], + 'il' => [0x00, 0x00, 0x00], + ]; + + foreach ( $colors as $name => $rgb ) { + if ( $processor->get_attribute( $name ) ) { + return $rgb; + } + } + + return null; +} diff --git a/parse-tokens.php b/parse-tokens.php new file mode 100644 index 0000000000000..b291918594b8e --- /dev/null +++ b/parse-tokens.php @@ -0,0 +1,89 @@ + +
This is like , , and and .
+