From d487c24f1bc999b60dda73d04db30129a1b28920 Mon Sep 17 00:00:00 2001
From: qiang_liu <liuqiang9596@gmail.com>
Date: Fri, 26 Apr 2024 17:05:43 +0800
Subject: [PATCH] Support Unicode letters in URL domain names and paths

---
 .../src/main/java/com/twitter/twittertext/Regex.java |  9 +++++----
 .../java/com/twitter/twittertext/ExtractorTest.java  | 12 ++++++++++++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/java/src/main/java/com/twitter/twittertext/Regex.java b/java/src/main/java/com/twitter/twittertext/Regex.java
index f90024c69..949ddead4 100644
--- a/java/src/main/java/com/twitter/twittertext/Regex.java
+++ b/java/src/main/java/com/twitter/twittertext/Regex.java
@@ -148,7 +148,8 @@ protected Regex() {
   private static final String URL_VALID_PRECEDING_CHARS =
       "(?:[^a-z0-9@＠$#＃" + INVALID_CHARACTERS + "]|[" + DIRECTIONAL_CHARACTERS + "]|^)";
 
-  private static final String URL_VALID_CHARS = "[a-z0-9" + LATIN_ACCENTS_CHARS + "]";
+  private static final String URL_UNICODE_CHARS = "\\p{M}\\p{L}";  // Unicode marks + letters
+  private static final String URL_VALID_CHARS = "[a-z0-9" + URL_UNICODE_CHARS + LATIN_ACCENTS_CHARS + "]";
   private static final String URL_VALID_SUBDOMAIN =
       "(?>(?:" + URL_VALID_CHARS + "[" + URL_VALID_CHARS + "\\-_]*)?" + URL_VALID_CHARS + "\\.)";
   private static final String URL_VALID_DOMAIN_NAME =
@@ -159,7 +160,7 @@ protected Regex() {
   // Any non-space, non-punctuation characters.
   // \p{Z} = any kind of whitespace or invisible separator.
   private static final String URL_VALID_UNICODE_CHARS =
-      "[^" + PUNCTUATION_CHARS + "\\s\\p{Z}\\p{InGeneralPunctuation}]";
+      "[^" + PUNCTUATION_CHARS + "\\s\\p{Z}\\p{InGeneralPunctuation}]";
   private static final String URL_VALID_UNICODE_DOMAIN_NAME =
       "(?:(?:" + URL_VALID_UNICODE_CHARS + "[" + URL_VALID_UNICODE_CHARS + "\\-]*)?" +
           URL_VALID_UNICODE_CHARS + "\\.)";
@@ -188,7 +189,7 @@ protected Regex() {
 
   private static final String URL_VALID_GENERAL_PATH_CHARS =
       "[a-z0-9!\\*';:=\\+,.\\$/%#\\[\\]\\-\\u2013_~\\|&@" +
-          LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + "]";
+          LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + URL_UNICODE_CHARS + "]";
 
   /**
    * Allow URL paths to contain up to two nested levels of balanced parens
@@ -216,7 +217,7 @@ protected Regex() {
    *   2. Allow =&# for empty URL parameters and other URL-join artifacts
    */
   private static final String URL_VALID_PATH_ENDING_CHARS =
-      "[a-z0-9=_#/\\-\\+" + LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + "]|(?:" +
+      "[a-z0-9=_#/\\-\\+" + LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + URL_UNICODE_CHARS + "]|(?:" +
           URL_BALANCED_PARENS + ")";
 
   private static final String URL_VALID_PATH = "(?:" +
diff --git a/java/src/test/java/com/twitter/twittertext/ExtractorTest.java b/java/src/test/java/com/twitter/twittertext/ExtractorTest.java
index 7fcc5b3fd..29cb44f15 100644
--- a/java/src/test/java/com/twitter/twittertext/ExtractorTest.java
+++ b/java/src/test/java/com/twitter/twittertext/ExtractorTest.java
@@ -377,6 +377,18 @@ public void testUrlWithSpecialCCTLDWithoutProtocol() {
     assertTrue("Should not extract URLs w/o protocol", extractor.extractURLs(text).isEmpty());
   }
 
+  public void testUrlWithUnicode() {
+    final String text = "http://www.詹姆斯.com http://www.詹姆斯.com/詹姆斯";
+    assertList("Failed to extract URLs with unicode",
+        new String[]{"http://www.詹姆斯.com", "http://www.詹姆斯.com/詹姆斯"},
+        extractor.extractURLs(text));
+    // A Unicode label can also be a subdomain of an ASCII domain.
+    final String text1 = "https://简体中文.winshipway.com/good/";
+    assertList("Failed to extract URLs with unicode",
+        new String[]{"https://简体中文.winshipway.com/good/"},
+        extractor.extractURLs(text1));
+  }
+
   /**
    * Helper method for asserting that the List of extracted Strings match the expected values.
    *