From 309d965228d2811565d76559d01d6dec4a86b8f2 Mon Sep 17 00:00:00 2001 From: scossu Date: Mon, 21 Oct 2024 22:04:42 -0400 Subject: [PATCH 1/2] Separate Thai words. --- requirements.txt | 1 + scriptshifter/hooks/asian_tokenizer/__init__.py | 8 ++++++++ scriptshifter/tables/data/thai.yml | 3 +++ scriptshifter/tables/data/thai_alt.yml | 5 +++++ scriptshifter/trans.py | 15 ++++++++++----- 5 files changed, 27 insertions(+), 5 deletions(-) create mode 100644 scriptshifter/hooks/asian_tokenizer/__init__.py diff --git a/requirements.txt b/requirements.txt index 4e85565..f5675ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ # Core application dependencies. aksharamukha>=2.2,<3 +esupar>=1.7.5 flask>=2.3,<3 flask-cors>=4.0,<5 python-dotenv>=1.0,<2 diff --git a/scriptshifter/hooks/asian_tokenizer/__init__.py b/scriptshifter/hooks/asian_tokenizer/__init__.py new file mode 100644 index 0000000..1b396d5 --- /dev/null +++ b/scriptshifter/hooks/asian_tokenizer/__init__.py @@ -0,0 +1,8 @@ +from esupar import load + + +def s2r_tokenize(ctx, model): + nlp = load(model) + token_data = nlp(ctx.src) + + ctx._src = " ".join(token_data.values[1]) diff --git a/scriptshifter/tables/data/thai.yml b/scriptshifter/tables/data/thai.yml index 10b80f8..46f6181 100644 --- a/scriptshifter/tables/data/thai.yml +++ b/scriptshifter/tables/data/thai.yml @@ -33,6 +33,9 @@ options: script_to_roman: hooks: post_config: + - + - asian_tokenizer.s2r_tokenize + - model: "KoichiYasuoka/roberta-base-thai-spm-upos" - - aksharamukha.romanizer.s2r_post_config - src_script: "Thai" diff --git a/scriptshifter/tables/data/thai_alt.yml b/scriptshifter/tables/data/thai_alt.yml index 72ac054..210e4d6 100644 --- a/scriptshifter/tables/data/thai_alt.yml +++ b/scriptshifter/tables/data/thai_alt.yml @@ -4,6 +4,11 @@ general: case_sensitive: false script_to_roman: + hooks: + post_normalize: + - + - asian_tokenizer.s2r_tokenize + - model: "KoichiYasuoka/roberta-base-thai-spm-upos" map: # COMMON SPECIAL 
CHARACTERS diff --git a/scriptshifter/trans.py b/scriptshifter/trans.py index 7d68601..8f5a39e 100644 --- a/scriptshifter/trans.py +++ b/scriptshifter/trans.py @@ -120,11 +120,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}): if _run_hook("post_config", ctx) == BREAK: return getattr(ctx, "dest", ""), ctx.warnings - _normalize_src(ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) - - if _run_hook("post_normalize", ctx) == BREAK: + # _normalize_src returns the results of the post_normalize hook. + if _normalize_src( + ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK: return getattr(ctx, "dest", ""), ctx.warnings + logger.debug(f"Normalized source: {ctx.src}") lang_map = list(get_lang_map(ctx.conn, ctx.lang_id, ctx.t_dir)) # Loop through source characters. The increment of each loop depends on @@ -151,7 +152,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}): # token or exit the scanning loop altogether. hret = _run_hook("begin_input_token", ctx) if hret == BREAK: - logger.debug("Breaking text scanning from hook signal.") + Logger.debug("Breaking text scanning from hook signal.") break if hret == CONT: logger.debug("Skipping scanning iteration from hook signal.") @@ -315,10 +316,14 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}): def _normalize_src(ctx, norm_rules): """ Normalize source text according to rules. + + NOTE: this manipulates the protected source attribute so it may not + correspond to the originally provided source. """ for nk, nv in norm_rules.items(): ctx._src = ctx.src.replace(nk, nv) - logger.debug(f"Normalized source: {ctx.src}") + + return _run_hook("post_normalize", ctx) def _is_bow(cur, ctx, word_boundary): From 237f1f899406466472271c589ca39d9c03d48cf9 Mon Sep 17 00:00:00 2001 From: scossu Date: Mon, 21 Oct 2024 22:11:31 -0400 Subject: [PATCH 2/2] Use model name shorthand. 
--- scriptshifter/tables/data/thai_alt.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scriptshifter/tables/data/thai_alt.yml b/scriptshifter/tables/data/thai_alt.yml index 210e4d6..ec99df4 100644 --- a/scriptshifter/tables/data/thai_alt.yml +++ b/scriptshifter/tables/data/thai_alt.yml @@ -8,7 +8,7 @@ script_to_roman: post_normalize: - - asian_tokenizer.s2r_tokenize - - model: "KoichiYasuoka/roberta-base-thai-spm-upos" + - model: "th" map: # COMMON SPECIAL CHARACTERS