From d487c24f1bc999b60dda73d04db30129a1b28920 Mon Sep 17 00:00:00 2001
From: qiang_liu
Date: Fri, 26 Apr 2024 17:05:43 +0800
Subject: [PATCH] add unicode domain name and path

Allow Unicode letters (\p{L}) and combining marks (\p{M}) in URL domain
labels, in general path characters and in path-ending characters, and add
an extractor test covering hosts and paths written in CJK characters.

URL_VALID_UNICODE_CHARS is deliberately left untouched: it is a negated
character class, so appending \p{M}\p{L} to it would exclude letters and
marks from Unicode domain labels instead of allowing them.

---
Note: the hunks below were revised after the commit was created, so the
commit id above and the post-image blob ids on the "index" lines are stale.

 .../src/main/java/com/twitter/twittertext/Regex.java |  8 +++++---
 .../java/com/twitter/twittertext/ExtractorTest.java  | 12 ++++++++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/java/src/main/java/com/twitter/twittertext/Regex.java b/java/src/main/java/com/twitter/twittertext/Regex.java
index f90024c69..949ddead4 100644
--- a/java/src/main/java/com/twitter/twittertext/Regex.java
+++ b/java/src/main/java/com/twitter/twittertext/Regex.java
@@ -148,7 +148,9 @@ protected Regex() {
   private static final String URL_VALID_PRECEDING_CHARS =
       "(?:[^a-z0-9@@$##" + INVALID_CHARACTERS + "]|[" + DIRECTIONAL_CHARACTERS + "]|^)";
 
-  private static final String URL_VALID_CHARS = "[a-z0-9" + LATIN_ACCENTS_CHARS + "]";
+  // Character-class fragment: Unicode combining marks (\p{M}) and letters (\p{L}).
+  private static final String URL_UNICODE_CHARS = "\\p{M}\\p{L}";
+  private static final String URL_VALID_CHARS = "[a-z0-9" + URL_UNICODE_CHARS + LATIN_ACCENTS_CHARS + "]";
   private static final String URL_VALID_SUBDOMAIN =
       "(?>(?:" + URL_VALID_CHARS + "[" + URL_VALID_CHARS + "\\-_]*)?" + URL_VALID_CHARS + "\\.)";
   private static final String URL_VALID_DOMAIN_NAME =
@@ -188,7 +190,7 @@ protected Regex() {
 
   private static final String URL_VALID_GENERAL_PATH_CHARS =
       "[a-z0-9!\\*';:=\\+,.\\$/%#\\[\\]\\-\\u2013_~\\|&@" +
-      LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + "]";
+      LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + URL_UNICODE_CHARS + "]";
 
   /**
    * Allow URL paths to contain up to two nested levels of balanced parens
@@ -216,7 +218,7 @@ protected Regex() {
    * 2. Allow =&# for empty URL parameters and other URL-join artifacts
    */
   private static final String URL_VALID_PATH_ENDING_CHARS =
-      "[a-z0-9=_#/\\-\\+" + LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + "]|(?:" +
+      "[a-z0-9=_#/\\-\\+" + LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + URL_UNICODE_CHARS + "]|(?:" +
       URL_BALANCED_PARENS + ")";
 
   private static final String URL_VALID_PATH = "(?:" +
diff --git a/java/src/test/java/com/twitter/twittertext/ExtractorTest.java b/java/src/test/java/com/twitter/twittertext/ExtractorTest.java
index 7fcc5b3fd..29cb44f15 100644
--- a/java/src/test/java/com/twitter/twittertext/ExtractorTest.java
+++ b/java/src/test/java/com/twitter/twittertext/ExtractorTest.java
@@ -377,6 +377,18 @@ public void testUrlWithSpecialCCTLDWithoutProtocol() {
     assertTrue("Should not extract URLs w/o protocol",
         extractor.extractURLs(text).isEmpty());
   }
+
+  public void testUrlWithUnicode() {
+    final String text = "http://www.詹姆斯.com http://www.詹姆斯.com/詹姆斯";
+    assertList("Failed to extract URLs with unicode",
+        new String[]{"http://www.詹姆斯.com", "http://www.詹姆斯.com/詹姆斯"},
+        extractor.extractURLs(text));
+
+    final String text1 = "https://简体中文.winshipway.com/good/";
+    assertList("Failed to extract URLs with unicode",
+        new String[]{"https://简体中文.winshipway.com/good/"},
+        extractor.extractURLs(text1));
+  }
   /**
    * Helper method for asserting that the List of extracted Strings match the expected values.
    *