From 309d965228d2811565d76559d01d6dec4a86b8f2 Mon Sep 17 00:00:00 2001
From: scossu <stefano@cossu.cc>
Date: Mon, 21 Oct 2024 22:04:42 -0400
Subject: [PATCH 1/2] Separate Thai words.

---
 requirements.txt                                |  1 +
 scriptshifter/hooks/asian_tokenizer/__init__.py |  8 ++++++++
 scriptshifter/tables/data/thai.yml              |  3 +++
 scriptshifter/tables/data/thai_alt.yml          |  5 +++++
 scriptshifter/trans.py                          | 15 ++++++++++-----
 5 files changed, 27 insertions(+), 5 deletions(-)
 create mode 100644 scriptshifter/hooks/asian_tokenizer/__init__.py

diff --git a/requirements.txt b/requirements.txt
index 4e85565..f5675ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 # Core application dependencies.
 aksharamukha>=2.2,<3
+esupar>=1.7.5
 flask>=2.3,<3
 flask-cors>=4.0,<5
 python-dotenv>=1.0,<2
diff --git a/scriptshifter/hooks/asian_tokenizer/__init__.py b/scriptshifter/hooks/asian_tokenizer/__init__.py
new file mode 100644
index 0000000..1b396d5
--- /dev/null
+++ b/scriptshifter/hooks/asian_tokenizer/__init__.py
@@ -0,0 +1,40 @@
+from functools import lru_cache
+
+from esupar import load
+
+
+@lru_cache(maxsize=None)
+def _load_model(model):
+    """
+    Load an esupar model, reusing it for subsequent calls.
+
+    Without this cache the model would be loaded again on every hook
+    invocation. Model names come from the table configuration.
+
+    Args:
+        model (str): Model name passed verbatim to `esupar.load`.
+
+    Return:
+        The callable object returned by `esupar.load`.
+    """
+    return load(model)
+
+
+def s2r_tokenize(ctx, model):
+    """
+    Insert spaces between the tokens of the source text.
+
+    The source is run through an esupar model and the protected source
+    attribute is replaced with the tokens joined by single spaces.
+
+    Args:
+        ctx: Transliteration context; `ctx.src` is read and `ctx._src` is
+            overwritten.
+        model (str): Model name to load, as set in the table hook config.
+    """
+    nlp = _load_model(model)
+    token_data = nlp(ctx.src)
+
+    # NOTE(review): index 1 is presumably the word-form column; confirm
+    # against the esupar output format.
+    ctx._src = " ".join(token_data.values[1])
diff --git a/scriptshifter/tables/data/thai.yml b/scriptshifter/tables/data/thai.yml
index 10b80f8..46f6181 100644
--- a/scriptshifter/tables/data/thai.yml
+++ b/scriptshifter/tables/data/thai.yml
@@ -33,6 +33,9 @@ options:
 script_to_roman:
   hooks:
     post_config:
+      -
+        - asian_tokenizer.s2r_tokenize
+        - model: "KoichiYasuoka/roberta-base-thai-spm-upos"
       -
         - aksharamukha.romanizer.s2r_post_config
         - src_script: "Thai"
diff --git a/scriptshifter/tables/data/thai_alt.yml b/scriptshifter/tables/data/thai_alt.yml
index 72ac054..210e4d6 100644
--- a/scriptshifter/tables/data/thai_alt.yml
+++ b/scriptshifter/tables/data/thai_alt.yml
@@ -4,6 +4,11 @@ general:
   case_sensitive: false
 
 script_to_roman:
+  hooks:
+    post_normalize:
+      -
+        - asian_tokenizer.s2r_tokenize
+        - model: "KoichiYasuoka/roberta-base-thai-spm-upos"
   map:
     # COMMON SPECIAL CHARACTERS
 
diff --git a/scriptshifter/trans.py b/scriptshifter/trans.py
index 7d68601..8f5a39e 100644
--- a/scriptshifter/trans.py
+++ b/scriptshifter/trans.py
@@ -120,11 +120,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
         if _run_hook("post_config", ctx) == BREAK:
             return getattr(ctx, "dest", ""), ctx.warnings
 
-        _normalize_src(ctx, get_lang_normalize(ctx.conn, ctx.lang_id))
-
-        if _run_hook("post_normalize", ctx) == BREAK:
+        # _normalize_src returns the results of the post_normalize hook.
+        if _normalize_src(
+                ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK:
             return getattr(ctx, "dest", ""), ctx.warnings
 
+        logger.debug(f"Normalized source: {ctx.src}")
         lang_map = list(get_lang_map(ctx.conn, ctx.lang_id, ctx.t_dir))
 
         # Loop through source characters. The increment of each loop depends on
@@ -151,7 +152,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
             # token or exit the scanning loop altogether.
             hret = _run_hook("begin_input_token", ctx)
             if hret == BREAK:
-                logger.debug("Breaking text scanning from hook signal.")
+                logger.debug("Breaking text scanning from hook signal.")
                 break
             if hret == CONT:
                 logger.debug("Skipping scanning iteration from hook signal.")
@@ -315,10 +316,19 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 def _normalize_src(ctx, norm_rules):
     """
     Normalize source text according to rules.
+
+    NOTE: this manipulates the protected source attribute so it may not
+    correspond to the originally provided source.
+
+    After the replacements are applied, the `post_normalize` hook is run.
+
+    Return:
+        The value returned by the `post_normalize` hook; the caller checks
+        it against BREAK.
     """
     for nk, nv in norm_rules.items():
         ctx._src = ctx.src.replace(nk, nv)
-    logger.debug(f"Normalized source: {ctx.src}")
+
+    return _run_hook("post_normalize", ctx)
 
 
 def _is_bow(cur, ctx, word_boundary):

From 237f1f899406466472271c589ca39d9c03d48cf9 Mon Sep 17 00:00:00 2001
From: scossu <stefano@cossu.cc>
Date: Mon, 21 Oct 2024 22:11:31 -0400
Subject: [PATCH 2/2] Use model name shorthand.

---
 scriptshifter/tables/data/thai_alt.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scriptshifter/tables/data/thai_alt.yml b/scriptshifter/tables/data/thai_alt.yml
index 210e4d6..ec99df4 100644
--- a/scriptshifter/tables/data/thai_alt.yml
+++ b/scriptshifter/tables/data/thai_alt.yml
@@ -8,7 +8,7 @@ script_to_roman:
     post_normalize:
       -
         - asian_tokenizer.s2r_tokenize
-        - model: "KoichiYasuoka/roberta-base-thai-spm-upos"
+        - model: "th"
   map:
     # COMMON SPECIAL CHARACTERS