From a4cb89abc43e5585235a191735cee35997916721 Mon Sep 17 00:00:00 2001 From: Chlod Alejandro Date: Fri, 16 Aug 2024 03:08:33 +0800 Subject: [PATCH] Process masked external links in tag wikitext Adds a `$includeExternalLinks` option to `parseWikitext` which enables parsing masked external links (`[https://w.wiki/M like this]`). This fixes tags showing up as raw wikitext (with a clickable link) instead of a properly-displaying tag. Edit summaries don't have this functionality, matching the behavior of MediaWiki. Also adds additional tests and `rel="nofollow"` to external links. Bug: T372531 --- src/Model/Record.php | 55 +++++++++++++++++++++++++++++++++----- tests/Model/RecordTest.php | 39 +++++++++++++++++++++++++-- 2 files changed, 86 insertions(+), 8 deletions(-) diff --git a/src/Model/Record.php b/src/Model/Record.php index 7f1dc75..f463361 100644 --- a/src/Model/Record.php +++ b/src/Model/Record.php @@ -192,7 +192,7 @@ public function getTags(): array { */ public function getTagLabels(): array { return array_map( function ( $tag ) { - return $this->parseWikitext( $tag ); + return $this->parseWikitext( $tag, true ); }, $this->data['tags_labels'] ?? [] ); } @@ -202,18 +202,61 @@ public function getTagLabels(): array { * * @see https://github.com/x-tools/xtools/blob/4795fb88dd392bb0474219be3ef9a1fc019a228b/src/Model/Edit.php#L336 * @param string $wikitext + * @param bool $includeExternalLinks Whether to include masked external links as part of parsing. * @return string */ - public function parseWikitext( string $wikitext ): string { + public function parseWikitext( string $wikitext, bool $includeExternalLinks = false ): string { $wikitext = htmlspecialchars( html_entity_decode( $wikitext ), ENT_NOQUOTES ); + // Hold a list of tokens so that we don't end up replacing the same thing twice. + $tokenList = []; + + // This regex is from https://stackoverflow.com/a/6041965/604142 + // This should only have one capture group: the whole URL. + // Ensure all other groups are (?:non-capturing). + $urlRegex = '\b((?:[\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|(?:[^[:punct:]\s]|/)))'; + + // Process masked external links, if requested. + // This goes before we process raw links, so that we don't convert both. + if ( $includeExternalLinks ) { + $wikitext = preg_replace_callback( + "%\[$urlRegex ([^]]+)]%s", + static function ( $matches ) use ( &$tokenList, $urlRegex ) { + // Do not convert if label URL match is `1` (is a URL) or + // `false` (failure), for safety + if ( preg_match( "%$urlRegex%s", $matches[2] ) !== 0 ) { + return $matches[0]; + } + + do { + $id = rand(); + } while ( isset( $tokenList[$id] ) ); + $token = ''; + $tokenList[$id] = "${matches[2]}"; + return $token; + }, + $wikitext + ); + } - // First link raw URLs. Courtesy of https://stackoverflow.com/a/11641499/604142 - $wikitext = preg_replace( - '%\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))%s', - '$1', + // Link raw URLs. + $wikitext = preg_replace_callback( + "%$urlRegex%s", + static function ( $matches ) use ( &$tokenList ) { + do { + $id = rand(); + } while ( isset( $tokenList[$id] ) ); + $token = ''; + $tokenList[$id] = "${matches[1]}"; + return $token; + }, $wikitext ); + // Replace all tokens from previous two steps. + foreach ( $tokenList as $id => $replacement ) { + $wikitext = str_replace( '', $replacement, $wikitext ); + } + $sectionMatch = null; $isSection = preg_match_all( "/^\/\* (.*?) \*\//", $wikitext, $sectionMatch ); $pageUrl = $this->getPageUrl(); diff --git a/tests/Model/RecordTest.php b/tests/Model/RecordTest.php index d979039..f944acc 100644 --- a/tests/Model/RecordTest.php +++ b/tests/Model/RecordTest.php @@ -173,15 +173,50 @@ public function testStatusJson(): void { } public function testParseWikitext(): void { + // XSS static::assertEquals( "<script>alert(\"XSS baby\")</script> " . "test page", $this->record->parseWikitext( ' [[test page]]' ) ); + // Wikilink static::assertEquals( - 'https://example.org', - $this->record->parseWikitext( 'https://example.org' ) + "MediaWiki", + $this->record->parseWikitext( "[[MediaWiki]]" ) + ); + + // Wikilink (starting with `:`) + static::assertEquals( + "MediaWiki", + $this->record->parseWikitext( "[[:MediaWiki]]" ) + ); + + // Raw link + static::assertEquals( + "https://example.org", + $this->record->parseWikitext( "https://example.org" ) + ); + + // Masked external link + static::assertEquals( + "[https://example.org test]", + $this->record->parseWikitext( "[https://example.org test]" ) + ); + + // == WITH MASKED EXTERNAL LINKS == + + // Masked external link + static::assertEquals( + "test", + $this->record->parseWikitext( "[https://example.org test]", true ) + ); + + // Misleading masked external link + static::assertEquals( + "[https://evil.example.org " + . "https://example.org]", + $this->record->parseWikitext( "[https://evil.example.org https://example.org]", true ) ); }