Skip to content

Commit

Permalink
General: Add support for most unicode letters in WP backend login names
Browse files Browse the repository at this point in the history
This accepts user names that contain a single script, but not mixed-script
names, such as ones that mix Latin and Cyrillic. That seemed to be closest
to the code's existing philosophy.
  • Loading branch information
arnt committed Sep 23, 2024
1 parent c8587f1 commit 75d628b
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 2 deletions.
114 changes: 113 additions & 1 deletion src/wp-includes/formatting.php
Original file line number Diff line number Diff line change
Expand Up @@ -2123,6 +2123,113 @@ function sanitize_file_name( $filename ) {
return apply_filters( 'sanitize_file_name', $filename, $filename_raw );
}

/**
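* Returns true if the alphabetic characters in the string all belong
* to a single Unicode script, and false if they come from two or more
* scripts. Non-alphabetic characters (digits, punctuation, spaces) are
* ignored.
*
* This returns true for an empty string, and for any string that
* contains no alphabetic characters.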
*
* IntlChar does not support returning the script property defined by
* https://www.unicode.org/reports/tr24/, so this implementation uses
* a workaround. Some of the old scripts have several code blocks, but
* the scripts currently being added have only one, since the

Check failure on line 2137 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Tabs must be used to indent lines; spaces are not allowed
* committee has grown better at estimating the necessary size.

Check failure on line 2138 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Tabs must be used to indent lines; spaces are not allowed

Check failure on line 2138 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Expected 1 spaces after opening parenthesis; 0 found

Check failure on line 2138 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Expected 1 spaces before closing parenthesis; 0 found

Check failure on line 2138 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / Check PHP compatibility / Run compatibility checks

The function mb_str_split() is not present in PHP version 7.3 or earlier

Check failure on line 2139 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Line indented incorrectly; expected 2 tabs, found 1

Check failure on line 2139 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Space after opening control structure is required

Check failure on line 2139 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

No space before opening parenthesis is prohibited

Check failure on line 2139 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

No space after opening parenthesis is prohibited

Check failure on line 2139 in src/wp-includes/formatting.php

View workflow job for this annotation

GitHub Actions / PHP coding standards / Run coding standards checks

Expected 1 space after IF keyword; 0 found
* This maps the known extension blocks ("Latin Extended-A" etc.) to
* the first block for that script, and then checks that the string
* uses only a single block. This works for the scripts currently in
* Unicode, and will work for future scripts as long as the committee
* keeps estimating correctly.
*
* @param string $input The string to check.
* @return bool True if at most one script is used, false otherwise.
*/

function uses_single_unicode_script( $input ) {
	// Checks whether every alphabetic character of $input lives in the same
	// (normalised) Unicode block, which is used here as a stand-in for the
	// Unicode script property.
	//
	// $input: the string to inspect (expected to be UTF-8).
	// Returns: true when zero or one script is used, false when two or more
	// are mixed or when $input is not valid UTF-8.

	// Split into code points. mb_str_split() requires PHP 7.4+, so a
	// UTF-8 aware preg_split() is used instead.
	$chars = preg_split( '//u', (string) $input, -1, PREG_SPLIT_NO_EMPTY );
	if ( false === $chars ) {
		// Malformed UTF-8: the scripts cannot be determined, so do not
		// vouch for the string.
		return false;
	}

	// Block of the first letter seen; NO_BLOCK means "no letter seen yet".
	$block = IntlChar::BLOCK_CODE_NO_BLOCK;

	foreach ( $chars as $cp ) {
		// Digits, punctuation, whitespace etc. are script-neutral.
		if ( ! IntlChar::isalpha( $cp ) ) {
			continue;
		}

		$b = IntlChar::getBlockCode( $cp );

		// Fold extension/supplement blocks into the primary block of
		// their script so that they compare equal below.
		switch ( $b ) {
			case IntlChar::BLOCK_CODE_LATIN_1_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_A:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_B:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_C:
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_D:
			case IntlChar::BLOCK_CODE_IPA_EXTENSIONS: // Used in Ghana etc.
			case IntlChar::BLOCK_CODE_LATIN_EXTENDED_ADDITIONAL:
				$b = IntlChar::BLOCK_CODE_BASIC_LATIN;
				break;

			case IntlChar::BLOCK_CODE_GREEK_EXTENDED:
			case IntlChar::BLOCK_CODE_COPTIC:
			case IntlChar::BLOCK_CODE_COPTIC_EPACT_NUMBERS:
				// Greek and Coptic overlap. Coptic looks like Greek
				// upper case, so readers of Greek can read Coptic,
				// but readers of Coptic can't necessarily read
				// Greek. This led to an unfortunate situation in
				// Unicode, where the two can't be properly
				// distinguished by block. However, because of the
				// overlap, this isn't really a problem.
				$b = IntlChar::BLOCK_CODE_GREEK;
				break; // Must not fall through into the Ethiopic group.

			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED:
			case IntlChar::BLOCK_CODE_ETHIOPIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_ETHIOPIC_SUPPLEMENT:
				$b = IntlChar::BLOCK_CODE_ETHIOPIC;
				break;

			case IntlChar::BLOCK_CODE_ARABIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_ARABIC_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_A:
			case IntlChar::BLOCK_CODE_ARABIC_PRESENTATION_FORMS_B:
				$b = IntlChar::BLOCK_CODE_ARABIC;
				break;

			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_A:
			case IntlChar::BLOCK_CODE_CYRILLIC_EXTENDED_B:
				$b = IntlChar::BLOCK_CODE_CYRILLIC;
				break;

			case IntlChar::BLOCK_CODE_BOPOMOFO_EXTENDED:
				$b = IntlChar::BLOCK_CODE_BOPOMOFO;
				break;

			case IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED:
				$b = IntlChar::BLOCK_CODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS;
				break;

			case IntlChar::BLOCK_CODE_DEVANAGARI_EXTENDED:
				$b = IntlChar::BLOCK_CODE_DEVANAGARI;
				break;

			case IntlChar::BLOCK_CODE_HANGUL_JAMO:
			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_A:
			case IntlChar::BLOCK_CODE_HANGUL_JAMO_EXTENDED_B:
				// There is no plain "HANGUL" block; the syllables block
				// is the primary Hangul block.
				$b = IntlChar::BLOCK_CODE_HANGUL_SYLLABLES;
				break;

			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_A:
			case IntlChar::BLOCK_CODE_MYANMAR_EXTENDED_B:
				$b = IntlChar::BLOCK_CODE_MYANMAR;
				break;

			case IntlChar::BLOCK_CODE_CJK_STROKES:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C:
			case IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS:
			case IntlChar::BLOCK_CODE_CJK_RADICALS_SUPPLEMENT:
			case IntlChar::BLOCK_CODE_ENCLOSED_CJK_LETTERS_AND_MONTHS:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_FORMS:
			case IntlChar::BLOCK_CODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT:
				$b = IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS;
				break;
		}

		// Remember the block of the first letter...
		if ( IntlChar::BLOCK_CODE_NO_BLOCK === $block ) {
			$block = $b;
		}

		// ...and reject the string as soon as another block shows up.
		if ( $block !== $b ) {
			return false;
		}
	}

	return true;
}

/**
* Sanitizes a username, stripping out unsafe characters.
*
Expand All @@ -2143,10 +2250,15 @@ function sanitize_user( $username, $strict = false ) {
$username = wp_strip_all_tags( $username );
$username = remove_accents( $username );
// Remove percent-encoded characters.
$username = preg_replace( '|%([a-fA-F0-9][a-fA-F0-9])|', '', $username );
$username = urldecode($username);
// Remove HTML entities.
$username = preg_replace( '/&.+?;/', '', $username );

// If mixing different scripts, remove all but ASCII.
if ( !uses_single_unicode_script($username) ) {
$username = preg_replace( '|[^a-z0-9 _.\-@]|i', '', $username );
}

// If strict, reduce to ASCII for max portability.
if ( $strict ) {
$username = preg_replace( '|[^a-z0-9 _.\-@]|i', '', $username );
Expand Down
22 changes: 21 additions & 1 deletion tests/phpunit/tests/formatting/sanitizeUser.php
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,30 @@ public function test_strips_encoded_ampersand_when_followed_by_semicolon() {
}

/**
 * A percent-encoded "ç" in the input is expected to come back as the
 * literal character; the expected value is lower-cased when
 * is_multisite() reports true.
 */
public function test_strips_percent_encoded_octets() {
	// Only the current expectation is kept; the earlier 'franois'/'Franois'
	// assignment was overwritten before use and has been dropped.
	$expected = is_multisite() ? 'françois' : 'François';
	$this->assertSame( $expected, sanitize_user( 'Fran%c3%a7ois' ) );
}
/**
 * With the second argument set to true, everything outside the plain
 * ASCII letters in the sample input is expected to be removed.
 */
public function test_optional_strict_mode_reduces_to_safe_ascii_subset() {
	$raw      = '()~ab~ˆcˆ!';
	$filtered = sanitize_user( $raw, true );

	$this->assertSame( 'abc', $filtered );
}

/**
 * An all-Arabic name must pass through unchanged, whether supplied
 * literally or as percent-encoded UTF-8.
 */
public function test_accepts_all_arabic() {
	$name = 'آرنت';

	// First the literal form, then the percent-encoded form.
	$candidates = array( $name, '%D8%A2%D8%B1%D9%86%D8%AA' );

	foreach ( $candidates as $candidate ) {
		$this->assertSame( $name, sanitize_user( $candidate ) );
	}
}

/**
 * A Latin name containing the letter "ɔ" must pass through unchanged,
 * whether supplied literally or as percent-encoded UTF-8.
 */
public function test_accepts_west_african_latin() {
	$name = 'tɔnatɔn';

	// First the literal form, then the percent-encoded form.
	$candidates = array( $name, 't%C9%94nat%C9%94n' );

	foreach ( $candidates as $candidate ) {
		$this->assertSame( $name, sanitize_user( $candidate ) );
	}
}

/**
 * A name mixing Latin letters with a percent-encoded Cyrillic letter
 * is expected to be reduced to its ASCII part.
 */
public function test_blocks_latin_cyrillic_mixed_name() {
	// %D1%82 is the UTF-8 encoding of Cyrillic "т" (U+0442).
	$this->assertSame( 'arn', sanitize_user( 'arn%D1%82' ) );
}
}

0 comments on commit 75d628b

Please sign in to comment.