From 75d628b238fe1eef1a95808224c5527369f060ad Mon Sep 17 00:00:00 2001
From: Arnt Gulbrandsen
Date: Mon, 23 Sep 2024 14:20:09 +0200
Subject: [PATCH] General: Add support for most unicode letters in WP backend login names

This accepts user names that contain a single script, but not
mixed-script names, such as ones that mix Latin and Cyrillic. That
seemed to be closest to the code's existing philosophy.
---
 src/wp-includes/formatting.php                | 130 +++++++++++++++++-
 .../phpunit/tests/formatting/sanitizeUser.php |  22 +++-
 2 files changed, 150 insertions(+), 2 deletions(-)

diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index d28973f9b16ae..0c549e7f5359d 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -2123,6 +2123,128 @@ function sanitize_file_name( $filename ) {
 	return apply_filters( 'sanitize_file_name', $filename, $filename_raw );
 }
 
+/**
+ * Checks whether the alphabetic characters of a string share one Unicode script.
+ *
+ * Returns true if the string contains no more than one Unicode script, and
+ * false if it contains two or more. Only alphabetic characters are considered,
+ * so a string without any, including the empty string, yields true.
+ *
+ * IntlChar does not support returning the script property defined by
+ * https://www.unicode.org/reports/tr24/, so this implementation uses a
+ * workaround. Some of the old scripts have several code blocks, but the
+ * scripts currently being added have only one, since the committee has grown
+ * better at estimating the necessary size.
+ *
+ * This maps the known extension blocks ("Latin Extended-A" etc.) to the first
+ * block for that script, and then checks that the string uses only a single
+ * block. This works for the scripts currently in Unicode, and will work for
+ * future scripts as long as the committee keeps estimating correctly.
+ *
+ * @param string $input The string to check.
+ * @return bool True if at most one script is used, or if the intl extension is
+ *              missing and the check cannot be performed. False if scripts are
+ *              mixed or the string cannot be split as UTF-8.
+ */
+function uses_single_unicode_script( $input ) {
+	// The intl extension is optional; without IntlChar the check cannot run.
+	if ( ! class_exists( 'IntlChar' ) ) {
+		return true;
+	}
+
+	// Split into code points via PCRE so that mbstring is not required.
+	$chars = preg_split( '//u', (string) $input, -1, PREG_SPLIT_NO_EMPTY );
+	if ( false === $chars ) {
+		return false;
+	}
+
+	$block = null;
+	foreach ( $chars as $char ) {
+		if ( true !== IntlChar::isalpha( $char ) ) {
+			continue;
+		}
+
+		$b = IntlChar::getBlockCode( $char );
+		switch ( $b ) {
+			case IntlChar::BLOCK_CODE_LATIN_1_SUPPLEMENT:
+			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_A:
+			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_B:
+			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_C:
+			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_D:
+			case IntlChar::BLOCK_CODE_IPA_EXTENSIONS: // Used in Ghana etc.
+			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_ADDITIONAL:
+				$b = IntlChar::BLOCK_CODE_BASIC_LATIN;
+				break;
+			case IntlChar::BLOCK_CODE_GREEK_EXTENDED:
+			case IntlChar::BLOCK_CODE_COPTIC:
+			case IntlChar::BLOCK_CODE_COPTIC_EPACT_NUMBERS:
+				// Greek and Coptic overlap. Coptic looks like Greek upper case,
+				// so readers of Greek can read Coptic, but readers of Coptic
+				// can't necessarily read Greek. This led to an unfortunate
+				// situation in Unicode, where the two can't be properly
+				// distinguished by block. However, because of the overlap,
+				// this isn't really a problem.
+				$b = IntlChar::BLOCK_CODE_GREEK;
+				break;
+			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED:
+			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED_A:
+			case IntlChar::BLOCK_CODE_ETHIOPIC_SUPPLEMENT:
+				$b = IntlChar::BLOCK_CODE_ETHIOPIC;
+				break;
+			case IntlChar::BLOCK_CODE_ARABIC_EXTENDED_A:
+			case IntlChar::BLOCK_CODE_ARABIC_SUPPLEMENT:
+			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_A:
+			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_B:
+				$b = IntlChar::BLOCK_CODE_ARABIC;
+				break;
+			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_A:
+			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_B:
+				$b = IntlChar::BLOCK_CODE_CYRILLIC;
+				break;
+			case IntlChar::BLOCK_CODE_BOPOMOFO_EXTENDED:
+				$b = IntlChar::BLOCK_CODE_BOPOMOFO;
+				break;
+			case IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED:
+				$b = IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS;
+				break;
+			case IntlChar::BLOCK_CODE_DEVANAGARI_EXTENDED:
+				$b = IntlChar::BLOCK_CODE_DEVANAGARI;
+				break;
+			case IntlChar::BLOCK_CODE_HANGUL_JAMO:
+			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_A:
+			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_B:
+				// IntlChar has no BLOCK_CODE_HANGUL; the main Hangul block is HANGUL_SYLLABLES.
+				$b = IntlChar::BLOCK_CODE_HANGUL_SYLLABLES;
+				break;
+			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_A:
+			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_B:
+				$b = IntlChar::BLOCK_CODE_MYANMAR;
+				break;
+			case IntlChar::BLOCK_CODE_CJK_STROKES:
+			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS:
+			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
+			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
+			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C:
+			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D:
+			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS:
+			case IntlChar::BLOCK_CODE_CJK_RADICALS_SUPPLEMENT:
+			case IntlChar::BLOCK_CODE_ENCLOSED_CJK_LETTERS_AND_MONTHS:
+			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_FORMS:
+			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT:
+				$b = IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS;
+				break;
+		}
+
+		if ( null === $block ) {
+			$block = $b;
+		} elseif ( $block !== $b ) {
+			return false;
+		}
+	}
+
+	return true;
+}
+
 /**
  * Sanitizes a username, stripping out unsafe characters.
  *
@@ -2143,10 +2265,16 @@ function sanitize_user( $username, $strict = false ) {
 	$username = wp_strip_all_tags( $username );
 	$username = remove_accents( $username );
 	// Remove percent-encoded characters.
-	$username = preg_replace( '|%([a-fA-F0-9][a-fA-F0-9])|', '', $username );
+	// They are decoded rather than dropped; tags are stripped again since decoding can expose markup.
+	$username = wp_strip_all_tags( rawurldecode( $username ) );
 	// Remove HTML entities.
 	$username = preg_replace( '/&.+?;/', '', $username );
 
+	// If mixing different scripts, remove all but ASCII.
+	if ( ! uses_single_unicode_script( $username ) ) {
+		$username = preg_replace( '|[^a-z0-9 _.\-@]|i', '', $username );
+	}
+
 	// If strict, reduce to ASCII for max portability.
 	if ( $strict ) {
 		$username = preg_replace( '|[^a-z0-9 _.\-@]|i', '', $username );
diff --git a/tests/phpunit/tests/formatting/sanitizeUser.php b/tests/phpunit/tests/formatting/sanitizeUser.php
index 1f1cadd88424b..d74c801bdfb38 100644
--- a/tests/phpunit/tests/formatting/sanitizeUser.php
+++ b/tests/phpunit/tests/formatting/sanitizeUser.php
@@ -35,10 +35,30 @@ public function test_strips_encoded_ampersand_when_followed_by_semicolon() {
 	}
 
 	public function test_strips_percent_encoded_octets() {
-		$expected = is_multisite() ? 'franois' : 'Franois';
+		$expected = is_multisite() ? 'françois' : 'François';
 		$this->assertSame( $expected, sanitize_user( 'Fran%c3%a7ois' ) );
 	}
 	public function test_optional_strict_mode_reduces_to_safe_ascii_subset() {
 		$this->assertSame( 'abc', sanitize_user( '()~ab~ˆcˆ!', true ) );
 	}
+
+	public function test_accepts_all_arabic() {
+		$expected = 'آرنت';
+		$encoded = '%D8%A2%D8%B1%D9%86%D8%AA';
+
+		$this->assertSame( $expected, sanitize_user( $expected ) );
+		$this->assertSame( $expected, sanitize_user( $encoded ) );
+	}
+
+	public function test_accepts_west_african_latin() {
+		$expected = 'tɔnatɔn';
+		$encoded = 't%C9%94nat%C9%94n';
+
+		$this->assertSame( $expected, sanitize_user( $expected ) );
+		$this->assertSame( $expected, sanitize_user( $encoded ) );
+	}
+
+	public function test_blocks_latin_cyrillic_mixed_name() {
+		$this->assertSame( 'arn', sanitize_user( 'arn%D1%82' ) );
+	}
 }