diff --git a/.gitignore b/.gitignore index 6543906..5cf11b6 100644 --- a/.gitignore +++ b/.gitignore @@ -137,5 +137,6 @@ tags.temp # Local ext/arabic_rom/data +scriptshifter/data/*.db !.keep VERSION diff --git a/Dockerfile b/Dockerfile index fc6eace..8b2e852 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,8 +3,9 @@ ARG WORKROOT "/usr/local/scriptshifter/src" # Copy core application files. WORKDIR ${WORKROOT} -COPY entrypoint.sh uwsgi.ini wsgi.py VERSION ./ +COPY VERSION entrypoint.sh sscli uwsgi.ini wsgi.py ./ COPY scriptshifter ./scriptshifter/ +COPY tests ./tests/ COPY requirements.txt ./ RUN pip install --no-cache-dir -r requirements.txt diff --git a/README.md b/README.md index 23c82dd..d4f89ea 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,21 @@ Currently, the following environment variables are defined: - `TXL_DICTA_EP`: Endpoint for the Dicta Hebrew transliteration service. This is mandatory for using the Hebrew module. +## Initial setup + +In order to run Scriptshifter, a local SQLite database must be created. The +simplest way to do that is via command-line: + +```bash +./sscli admin init-db +``` + +This step is already included in the `entrypoint.sh` script that gets executed +by Docker, so no additional action is necessary. + +Note that the DB must be recreated every time any of the configuration tables +in `scriptshifter/tables/data` changes. + ## Local development server For local development, it is easiest to run Flask without the WSGI wrapper, @@ -73,11 +88,12 @@ string in a production environment. `TXL_LOGLEVEL`: Logging level. Use Python notation. The default is `WARN`. -`TXL_SMTP_HOST`: SMTP host to send feedback messages through. Defaults to -`localhost`. +`TXL_SMTP_HOST`: SMTP host to send feedback messages through. If not defined, +the feedback form will not be shown in the UI. `TXL_SMTP_PORT`: Port of the SMTP server. Defaults to `1025`. + ## Web UI `/` renders a simple HTML form to test the transliteration service. @@ -88,6 +104,25 @@ the drop-down automatically. The value must be one of the keys found in `/languages`. +## Command-line interface + +Various Scriptshifter commands can be accessed via the shell command `sscli`. +At the moment only a few essential admin and testing tools are available. More +commands can be made avaliable on an as-needed basis. + +Help menu: + +``` +/path/to/sscli --help +``` + +Section help: + +``` +/path/to/sscli admin --help +``` + + ## Contributing See the [contributing guide](./doc/contributing.md). diff --git a/entrypoint.sh b/entrypoint.sh index 7ef24c3..ccecb7f 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -12,6 +12,8 @@ fi host=${TXL_WEBAPP_HOST:-"0.0.0.0"} port=${TXL_WEBAPP_PORT:-"8000"} +./sscli admin init-db + if [ "${FLASK_ENV}" == "development" ]; then exec flask run -h $host -p $port else diff --git a/scriptshifter/__init__.py b/scriptshifter/__init__.py index ea34ebd..ea5aef5 100644 --- a/scriptshifter/__init__.py +++ b/scriptshifter/__init__.py @@ -9,6 +9,14 @@ APP_ROOT = path.dirname(path.realpath(__file__)) +""" +SQLite database path. + +This DB stores all the runtime transliteration data. +""" +DB_PATH = environ.get( + "DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db")) + """ SMTP server for sending email. 
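The new `DB_PATH` constant above is resolved from the environment at import time, with `scriptshifter/data/scriptshifter.db` as the default, so any override has to be in place before the package is first imported. A minimal sketch of a programmatic rebuild equivalent to `./sscli admin init-db` (the `/tmp` path is only an example location):

```python
# Hypothetical setup: point Scriptshifter at a throwaway DB, then rebuild it
# from the YAML tables. This must run before `scriptshifter` is imported,
# since DB_PATH is read from the environment at import time.
import os

os.environ["DB_PATH"] = "/tmp/scriptshifter-dev.db"

from scriptshifter.tables import init_db

init_db()  # parses index.yml and swaps the finished DB into place on success
```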
For a dummy server that just echoes the messages, run: `python -m smtpd -n -c DebuggingServer localhost:1025` diff --git a/scriptshifter/data/.keep b/scriptshifter/data/.keep new file mode 100644 index 0000000..e69de29 diff --git a/scriptshifter/hooks/greek/__init__.py b/scriptshifter/hooks/greek/__init__.py index f098375..c71291a 100644 --- a/scriptshifter/hooks/greek/__init__.py +++ b/scriptshifter/hooks/greek/__init__.py @@ -6,9 +6,9 @@ from scriptshifter.exceptions import CONT -# Suffixed by ʹ # Indices are positions in the numeric string from the right DIGITS = { + # Suffixed by ʹ (U+0374) 1: { # Units "α": 1, "β": 2, @@ -45,7 +45,7 @@ "ω": 8, "ϡ": 9, }, - # Prefixed by ͵ + # Prefixed by ͵ (U+0375) 4: { "α": 1, "β": 2, diff --git a/scriptshifter/rest_api.py b/scriptshifter/rest_api.py index 0d77d11..ad65419 100644 --- a/scriptshifter/rest_api.py +++ b/scriptshifter/rest_api.py @@ -1,7 +1,6 @@ import logging from base64 import b64encode -from copy import deepcopy from email.message import EmailMessage from json import dumps from os import environ, urandom @@ -15,7 +14,7 @@ GIT_COMMIT, GIT_TAG, SMTP_HOST, SMTP_PORT) from scriptshifter.exceptions import ApiError -from scriptshifter.tables import list_tables, load_table +from scriptshifter.tables import list_tables, get_language from scriptshifter.trans import transliterate @@ -89,16 +88,9 @@ def list_languages(): @app.route("/table/") def dump_table(lang): """ - Dump parsed transliteration table for a language. + Dump a language configuration from the DB. """ - tbl = deepcopy(load_table(lang)) - for sec_name in ("roman_to_script", "script_to_roman"): - if sec_name in tbl: - for hname, fn_defs in tbl[sec_name].get("hooks", {}).items(): - tbl[sec_name]["hooks"][hname] = [ - (fn.__name__, kw) for (fn, kw) in fn_defs] - - return jsonify(tbl) + return get_language(lang) @app.route("/options/", methods=["GET"]) @@ -106,7 +98,7 @@ def get_options(lang): """ Get extra options for a table. """ - tbl = load_table(lang) + tbl = get_language(lang) return jsonify(tbl.get("options", [])) diff --git a/scriptshifter/tables/__init__.py b/scriptshifter/tables/__init__.py index 02e4ab4..7df1958 100644 --- a/scriptshifter/tables/__init__.py +++ b/scriptshifter/tables/__init__.py @@ -1,9 +1,13 @@ import logging import re +import sqlite3 +from collections import defaultdict from functools import cache from importlib import import_module -from os import environ, path, access, R_OK +from json import dumps as jdumps, loads as jloads +from os import R_OK, access, environ, makedirs, path, unlink +from shutil import move from yaml import load try: @@ -11,17 +15,22 @@ except ImportError: from yaml import Loader +from scriptshifter import DB_PATH from scriptshifter.exceptions import BREAK, ConfigError __doc__ = """ Transliteration tables. -These tables contain all transliteration information, grouped by script and -language (or language and script? TBD) +These tables contain all transliteration information. The static YML files are +transformed and loaded into a database, which is the effective data source at +runtime. """ +TMP_DB_PATH = path.join( + path.dirname(DB_PATH), "~tmp." + path.basename(DB_PATH)) + DEFAULT_TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data") # Can be overridden for tests. TABLE_DIR = environ.get("TXL_CONFIG_TABLE_DIR", DEFAULT_TABLE_DIR) @@ -52,6 +61,11 @@ BOW = 1 << 1 EOW = 1 << 0 +# Feature flags used in database tables. +FEAT_S2R = 1 << 0 # Has S2R. +FEAT_R2S = 1 << 1 # Has R2S. 
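These two flags, together with `FEAT_CASEI` and `FEAT_RE` defined immediately below, are the bit values packed into the integer `features` columns of the new schema; the language-level bits are what `list_tables()` and `get_lang_general()` decode back into booleans. A small self-contained sketch of that decoding, assuming the bit layout stays as declared here:

```python
# Sketch of decoding tbl_language.features, mirroring list_tables() /
# get_lang_general(). Constants are duplicated so the snippet stands alone.
FEAT_S2R = 1 << 0    # has a script_to_roman section
FEAT_R2S = 1 << 1    # has a roman_to_script section
FEAT_CASEI = 1 << 2  # script is case-insensitive

def decode_features(features: int) -> dict:
    return {
        "has_s2r": bool(features & FEAT_S2R),
        "has_r2s": bool(features & FEAT_R2S),
        "case_sensitive": not (features & FEAT_CASEI),
    }

assert decode_features(FEAT_S2R | FEAT_R2S) == {
    "has_s2r": True, "has_r2s": True, "case_sensitive": True,
}
```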
+FEAT_CASEI = 1 << 2 # Case-insensitive script. +FEAT_RE = 1 << 3 # Regular expression. logger = logging.getLogger(__name__) @@ -123,6 +137,158 @@ def __hash__(self): return hash(self.content) +def init_db(): + """ + Populate database with language data. + + This operation removes any preexisting database. + + All tables in the index file (`./data/index.yml`) will be parsed + (including inheritance rules) and loaded into the designated DB. + + This must be done only once at bootstrap. To update individual tables, + see populate_table(), which this function calls iteratively. + """ + # Create parent diretories if necessary. + # If the DB already exists, it will be overwritten ONLY on success at + # hhis point. + if path.isfile(TMP_DB_PATH): + # Remove previous temp file (possibly from failed attempt) + unlink(TMP_DB_PATH) + else: + makedirs(path.dirname(TMP_DB_PATH), exist_ok=True) + + conn = sqlite3.connect(TMP_DB_PATH) + + # Initialize schema. + with open(path.join(path.dirname(DEFAULT_TABLE_DIR), "init.sql")) as fh: + with conn: + conn.executescript(fh.read()) + + # Populate tables. + with open(path.join(TABLE_DIR, "index.yml")) as fh: + tlist = load(fh, Loader=Loader) + try: + with conn: + for tname, tdata in tlist.items(): + res = conn.execute( + """INSERT INTO tbl_language ( + name, label, marc_code, description + ) VALUES (?, ?, ?, ?)""", + ( + tname, tdata.get("name"), tdata.get("marc_code"), + tdata.get("description"), + ) + ) + populate_table(conn, res.lastrowid, tname) + + # If the DB already exists, it will be overwritten ONLY on success at + # thhis point. + move(TMP_DB_PATH, DB_PATH) + finally: + conn.close() + if path.isfile(TMP_DB_PATH): + # Remove leftover temp files from bungled up operation. + unlink(TMP_DB_PATH) + + +def get_connection(): + """ + Get the default DB connection object. + + To be closed by the caller or used as a context. + """ + return sqlite3.connect(DB_PATH) + + +def populate_table(conn, tid, tname): + data = load_table(tname) + flags = 0 + if "script_to_roman" in data: + flags |= FEAT_S2R + if "roman_to_script" in data: + flags |= FEAT_R2S + + conn.execute( + "UPDATE tbl_language SET features = ? WHERE id = ?", + (flags, tid)) + + for t_dir in (FEAT_S2R, FEAT_R2S): + # BEGIN per-section loop. + + sec_name = ( + "script_to_roman" if t_dir == FEAT_S2R else "roman_to_script") + sec = data.get(sec_name) + if not sec: + continue + + # Transliteration map. + sort = 1 + for k, v in sec.get("map", {}): + conn.execute( + """INSERT INTO tbl_trans_map ( + lang_id, dir, src, dest, sort + ) VALUES (?, ?, ?, ?, ?)""", + (tid, t_dir, k, v, sort)) + sort += 1 + + # hooks. + for k, v in sec.get("hooks", {}).items(): + for i, hook_data in enumerate(v, start=1): + conn.execute( + """INSERT INTO tbl_hook ( + lang_id, dir, name, sort, module, fn, kwargs + ) VALUES (?, ?, ?, ?, ?, ?, ?)""", + ( + tid, t_dir, k, i, hook_data[0], + hook_data[1].__name__, jdumps(hook_data[2]))) + + # Ignore rules (R2S only). + for row in sec.get("ignore", []): + if isinstance(row, dict): + if "re" in row: + flags = FEAT_RE + rule = row["re"] + else: + flags = 0 + rule = row + + conn.execute( + """INSERT INTO tbl_ignore ( + lang_id, rule, features + ) VALUES (?, ?, ?)""", + (tid, rule, flags)) + + # Double caps (S2R only). + for rule in sec.get("double_cap", []): + conn.execute( + """INSERT INTO tbl_double_cap ( + lang_id, rule + ) VALUES (?, ?)""", + (tid, rule)) + + # Normalize (S2R only). 
+ for src, dest in sec.get("normalize", {}).items(): + conn.execute( + """INSERT INTO tbl_normalize (lang_id, src, dest) + VALUES (?, ?, ?)""", + (tid, src, dest)) + + # END per-section loop. + + # UI options + for opt in data.get("options", []): + conn.execute( + """INSERT INTO tbl_option ( + lang_id, name, label, description, dtype, + options, default_v + ) VALUES (?, ?, ?, ?, ?, ?, ?)""", + ( + tid, opt["id"], opt["label"], opt["description"], + opt["type"], jdumps(opt.get("options")), + opt["default"])) + + @cache def list_tables(): """ @@ -131,16 +297,29 @@ def list_tables(): Note that this may not correspond to all the table files in the data folder, but only those exposed in the index. """ - with open(path.join(TABLE_DIR, "index.yml")) as fh: - tdata = load(fh, Loader=Loader) + conn = get_connection() + + with conn: + data = conn.execute( + """SELECT name, label, features, marc_code, description + FROM tbl_language""") + tdata = { + row[0]: { + "label": row[1], + "has_s2r": bool(row[2] & FEAT_S2R), + "has_r2s": bool(row[2] & FEAT_R2S), + "case_sensitive": not (row[2] & FEAT_CASEI), + "marc_code": row[3], + "description": row[4], + } for row in data + } return tdata -@cache def load_table(tname): """ - Load one transliteration table and possible parents. + Parse one transliteration table and possible parents from YML files. The table file is parsed into an in-memory configuration that contains the language & script metadata and parsing rules. @@ -304,6 +483,176 @@ def load_hook_fn(cname, sec): f"Hook function {fnname} defined in {cname} configuration " f"not found in module {HOOK_PKG_PATH}.{modname}!" ) - hook_fn[cfg_hook].append((fn, fn_kwargs)) + hook_fn[cfg_hook].append((modname, fn, fn_kwargs)) return hook_fn + + +@cache +def get_language(lang): + """ Get all language options from the DB. """ + + conn = get_connection() + + with conn: + general = get_lang_general(conn, lang) + lang_id = general["id"] + data = general["data"] + + # Normalization. + + norm_data = get_lang_normalize(conn, lang_id) + if len(norm_data): + data["normalize"] = norm_data + + # Script to Roman map and hooks. + + if data["has_s2r"]: + data["script_to_roman"] = {} + s2r_map = tuple( + row for row in get_lang_map(conn, lang_id, FEAT_S2R)) + if len(s2r_map): + data["script_to_roman"]["map"] = s2r_map + + s2r_hooks = get_lang_hooks(conn, lang_id, FEAT_S2R) + if len(s2r_hooks): + data["script_to_roman"]["hooks"] = s2r_hooks + + # Roman to script map, ignore list, and hooks. + + if data["has_r2s"]: + data["roman_to_script"] = {} + r2s_map = tuple( + row for row in get_lang_map(conn, lang_id, FEAT_R2S)) + if len(r2s_map): + data["roman_to_script"]["map"] = r2s_map + + r2s_ignore = get_lang_ignore(conn, lang_id) + if len(r2s_ignore): + data["roman_to_script"]["ignore"] = r2s_ignore + + r2s_hooks = get_lang_hooks(conn, lang_id, FEAT_R2S) + if len(r2s_hooks): + data["roman_to_script"]["hooks"] = r2s_hooks + + opt_data = get_lang_options(conn, lang_id) + if len(opt_data): + data["options"] = opt_data + + double_cap = get_lang_dcap(conn, lang_id) + if len(double_cap): + data["double_cap"] = double_cap + + conn.close() + + return data + + +def get_lang_general(conn, lang): + """ Language general attributes. 
""" + lang_q = conn.execute( + """SELECT id, name, label, features, marc_code, description + FROM tbl_language WHERE name = ?""", (lang,)) + lang_data = lang_q.fetchone() + + return { + "id": lang_data[0], + "data": { + "name": lang_data[1], + "label": lang_data[2], + "has_s2r": bool(lang_data[3] & FEAT_S2R), + "has_r2s": bool(lang_data[3] & FEAT_R2S), + "case_sensitive": not (lang_data[3] & FEAT_CASEI), + "marc_code": lang_data[4], + "description": lang_data[5], + }, + } + + +def get_lang_normalize(conn, lang_id): + qry = conn.execute( + """SELECT src, dest FROM tbl_normalize + WHERE lang_id = ?""", + (lang_id,)) + return {row[0]: row[1] for row in qry} + + +def get_lang_ignore(conn, lang_id): + """ + Ignore list as a tuple. + """ + qry = conn.execute( + """SELECT rule, features FROM tbl_ignore + WHERE lang_id = ?""", + (lang_id,)) + # Features (regular expressions) not implemented yet. + return tuple(row[0] for row in qry) + + +@cache +def get_lang_map(conn, lang_id, t_dir): + """ + S2R or R2S map. + + Generator of tuples (source, destination). + """ + qry = conn.execute( + """SELECT src, dest FROM tbl_trans_map + WHERE lang_id = ? AND dir = ? + ORDER BY sort ASC""", + (lang_id, t_dir)) + + for row in qry: + yield (Token(row[0]), row[1]) + + +def get_lang_options(conn, lang_id): + """ Language options as a tuple of dictionaries. """ + qry = conn.execute( + """SELECT name, label, description, dtype, options, default_v + FROM tbl_option + WHERE lang_id = ?""", + (lang_id,)) + + return tuple( + { + "id": row[0], + "label": row[1], + "description": row[2], + "type": row[3], + "options": jloads(row[4]) if row[4] else None, + "default": row[5], + } + for row in qry + ) + + +def get_lang_hooks(conn, lang_id, t_dir): + """ Language hooks in sorting order. """ + hooks = defaultdict(list) + + qry = conn.execute( + """SELECT name, module, fn, kwargs + FROM tbl_hook WHERE lang_id = ? AND dir = ? 
+ ORDER BY name, sort""", + (lang_id, t_dir)) + + for row in qry: + hooks[row[0]].append( + { + "module_name": row[1], + "fn_name": row[2], + "kwargs": jloads(row[3]), + } + ) + + return hooks + + +def get_lang_dcap(conn, lang_id): + qry = conn.execute( + """SELECT rule + FROM tbl_double_cap WHERE lang_id = ?""", + (lang_id,)) + + return tuple(row[0] for row in qry) diff --git a/scriptshifter/tables/data/asian_cyrillic.yml b/scriptshifter/tables/data/asian_cyrillic.yml index 53c7324..b58ebec 100644 --- a/scriptshifter/tables/data/asian_cyrillic.yml +++ b/scriptshifter/tables/data/asian_cyrillic.yml @@ -391,8 +391,8 @@ roman_to_script: script_to_roman: map: - "\u00AB": """ - "\u00BB": """ + "\u00AB": "\"" + "\u00BB": "\"" "\u2116": "No\u0332" "\u0400": "E\u0300" "\u0401": "E\u0308" diff --git a/scriptshifter/tables/data/bashkir_cyrillic.yml b/scriptshifter/tables/data/bashkir_cyrillic.yml index 8d3a16f..f898425 100644 --- a/scriptshifter/tables/data/bashkir_cyrillic.yml +++ b/scriptshifter/tables/data/bashkir_cyrillic.yml @@ -23,7 +23,7 @@ roman_to_script: "U\u0307": "\u04AE" "u\u0307": "\u04AF" "TH": "\u04AA" - "Th": "\u04AA"s + "Th": "\u04AA" "th": "\u04AB" "J": "\u04B8" "j": "\u04B9" diff --git a/scriptshifter/tables/data/index.yml b/scriptshifter/tables/data/index.yml index 9dbae7d..48f98d5 100644 --- a/scriptshifter/tables/data/index.yml +++ b/scriptshifter/tables/data/index.yml @@ -47,7 +47,7 @@ bengali: bulgarian: marc_code: bul name: Bulgarian -buriat: +buriat_cyrillic: marc_code: bua name: Buriat (Cyrillic) burmese: @@ -111,7 +111,7 @@ kannada: kara-kalpak_cyrillic: marc_code: kaa name: Kara-Kalpak (Cyrillic) -karachai-balkar_cyrillic: +karachay-balkar_cyrillic: marc_code: krc name: Karachay-Balkar (Cyrillic) karelian_cyrillic: diff --git a/scriptshifter/tables/data/kara-kalpak_cyrillic.yml b/scriptshifter/tables/data/kara-kalpak_cyrillic.yml index b3a5375..109a5df 100644 --- a/scriptshifter/tables/data/kara-kalpak_cyrillic.yml +++ b/scriptshifter/tables/data/kara-kalpak_cyrillic.yml @@ -27,11 +27,11 @@ roman_to_script: script_to_roman: map: "\u040E": "W" - "\u045E"" "w" + "\u045E": "w" "\u0492": "Gh" "\u0493": "gh" "\u049A": "Q" - "\u-49B": "q" + "\u049B": "q" "\u04A2": "N\uFE20G\uFE21" "\u04A3": "n\uFE20g\uFE21" "\u04AE": "U\u0307" diff --git a/scriptshifter/tables/data/komi_cyrillic.yml b/scriptshifter/tables/data/komi_cyrillic.yml index ee74805..b9259ae 100644 --- a/scriptshifter/tables/data/komi_cyrillic.yml +++ b/scriptshifter/tables/data/komi_cyrillic.yml @@ -5,10 +5,10 @@ general: roman_to_script: map: - "D\u0320Z\u0320\H\u\0320": "\u0496" - "D\u0320Z\u0320\h\u\0320": "\u0496" - "D\u0320z\u0320\h\u\0320": "\u0496" - "d\u0320z\u0320\h\u\0320": "\u0497" + "D\u0320Z\u0320H\u0320": "\u0496" + "D\u0320Z\u0320h\u0320": "\u0496" + "D\u0320z\u0320h\u0320": "\u0496" + "d\u0320z\u0320h\u0320": "\u0497" "D\uFE20Z\uFE21": "\u0506" "D\uFE20z\uFE21": "\u0506" "d\uFE20z\uFE21": "\u0507" diff --git a/scriptshifter/tables/data/mongolian_mongol_bichig.yml b/scriptshifter/tables/data/mongolian_mongol_bichig.yml index 30a85ae..27de086 100644 --- a/scriptshifter/tables/data/mongolian_mongol_bichig.yml +++ b/scriptshifter/tables/data/mongolian_mongol_bichig.yml @@ -6,12 +6,12 @@ general: roman_to_script: map: - "\u002Daca": "\u202F\u1820\u1834\u1820 + "\u002Daca": "\u202F\u1820\u1834\u1820" "\u002DA": "\u180E\u1820" "\u002Da": "\u180E\u1820" "A": "\u1820" "a": "\u1820" - "\u002Dece": "\u202F\u1821\u1834\u1821 + "\u002Dece": "\u202F\u1821\u1834\u1821" "\u002DE": "\u180E\u1821" "\u002De": 
"\u180E\u1821" "\u002D": "\u202F" diff --git a/scriptshifter/tables/data/yiddish.yml b/scriptshifter/tables/data/yiddish.yml index c55c431..9539695 100644 --- a/scriptshifter/tables/data/yiddish.yml +++ b/scriptshifter/tables/data/yiddish.yml @@ -4,7 +4,7 @@ general: options: - id: loshn_koydesh label: Loshn Koydesh - description: [TODO] + description: "Apply Loshn Koydesh vocalization." type: boolean default: false diff --git a/scriptshifter/tables/init.sql b/scriptshifter/tables/init.sql new file mode 100644 index 0000000..a563d1c --- /dev/null +++ b/scriptshifter/tables/init.sql @@ -0,0 +1,107 @@ +/* + * Master language table. + * + * Overview of languages available in Scriptshifter. + */ +CREATE TABLE tbl_language ( + id INTEGER PRIMARY KEY, + name TEXT UNIQUE, + label TEXT, + marc_code TEXT, + description TEXT, + features TINYINT DEFAULT 0 +); + +/* + * Transliteration maps. + * + * Each row is a S2R or R2S pair associated with a language ID. + */ +CREATE TABLE tbl_trans_map ( + id INTEGER PRIMARY KEY, + lang_id INTEGER NOT NULL, + dir TINYINT NOT NULL DEFAULT 0, /* 1 = S2R; 2 = R2S */ + src TEXT NOT NULL, + dest TEXT, + sort INT NOT NULL, /* Smaller values have higher priority. */ + + FOREIGN KEY (lang_id) REFERENCES tbl_language(id) ON DELETE CASCADE +); +CREATE UNIQUE INDEX idx_trans_lookup ON tbl_trans_map (lang_id, dir, src); +CREATE INDEX idx_trans_map_sort ON tbl_trans_map (sort ASC); + +/* + * Processing hooks. + * + * Note that multiple functions may be grouped under the same hook, lang, and + * direction. These are ordered by `sort`. + */ +CREATE TABLE tbl_hook ( + id INTEGER PRIMARY KEY, + lang_id INTEGER NOT NULL, + dir TINYINT NOT NULL DEFAULT 0, /* 1 = S2R; 2 = R2S */ + name TEXT NOT NULL, /* Hook name. */ + sort INT NOT NULL, /* Function sorting order within the hook. */ + module TEXT NOT NULL, /* Module name. */ + fn TEXT NOT NULL, /* Function name. */ + kwargs TEXT, /* KW arguments as JSON blob. */ + + FOREIGN KEY (lang_id) REFERENCES tbl_language(id) ON DELETE CASCADE +); +CREATE INDEX idx_hook_lookup ON tbl_hook (lang_id, dir); +CREATE INDEX idx_hookname_lookup ON tbl_hook (name); +CREATE INDEX idx_hook_sort ON tbl_hook (sort ASC); + +/* + * Ignore lists for R2S. + */ +CREATE TABLE tbl_ignore ( + id INTEGER PRIMARY KEY, + lang_id INTEGER NOT NULL, + rule TEXT NOT NULL, + features TINYINT, /* 1 = case insensitive; 2 = regular expression. */ + + FOREIGN KEY (lang_id) REFERENCES tbl_language(id) ON DELETE CASCADE +); + +/* + * Double capitals. + */ +CREATE TABLE tbl_double_cap ( + id INTEGER PRIMARY KEY, + lang_id INTEGER NOT NULL, + rule TEXT NOT NULL, + + FOREIGN KEY (lang_id) REFERENCES tbl_language(id) ON DELETE CASCADE +); + +/* + * Normalization rules. + */ +CREATE TABLE tbl_normalize ( + id INTEGER PRIMARY KEY, + lang_id INTEGER NOT NULL, + src TEXT NOT NULL, + dest TEXT NOT NULL, + + FOREIGN KEY (lang_id) REFERENCES tbl_language(id) ON DELETE CASCADE +); + +/* + * Input options. + */ +CREATE TABLE tbl_option ( + id INTEGER PRIMARY KEY, + lang_id INTEGER NOT NULL, + name TEXT NOT NULL, + label TEXT NOT NULL, + description TEXT, + dtype TEXT, + options TEXT, + default_v TEXT, + + FOREIGN KEY (lang_id) REFERENCES tbl_language(id) ON DELETE CASCADE +); +CREATE UNIQUE INDEX idx_option_lookup ON tbl_option (lang_id, name); + + diff --git a/scriptshifter/templates/index.html b/scriptshifter/templates/index.html index a9d974f..c7a0c7b 100644 --- a/scriptshifter/templates/index.html +++ b/scriptshifter/templates/index.html @@ -60,7 +60,7 @@ @@ -176,4 +176,4 @@

Submit feedback

{% endif %} -{% endblock %} \ No newline at end of file +{% endblock %} diff --git a/scriptshifter/trans.py b/scriptshifter/trans.py index ea55046..9b1e552 100644 --- a/scriptshifter/trans.py +++ b/scriptshifter/trans.py @@ -1,9 +1,13 @@ import logging +from importlib import import_module from re import compile from scriptshifter.exceptions import BREAK, CONT -from scriptshifter.tables import BOW, EOW, WORD_BOUNDARY, load_table +from scriptshifter.tables import ( + BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH, + get_connection, get_lang_dcap, get_lang_general, get_lang_hooks, + get_lang_ignore, get_lang_map, get_lang_normalize) # Match multiple spaces. @@ -15,6 +19,8 @@ class Context: """ Context used within the transliteration and passed to hook functions. + + Use within a `with` block for proper cleanup. """ @property def src(self): @@ -28,23 +34,35 @@ def src(self): def src(self): raise NotImplementedError("Attribute is read-only.") - def __init__(self, src, general, langsec, options={}): + def __init__(self, lang, src, t_dir, options={}): """ Initialize a context. Args: src (str): The original text. Read-only. - general (dict): general section of the current config. - langsec (dict): Language configuration section being used. + t_dir (int): the direction of transliteration. + Either FEAT_R2S or FEAT_S2R. options (dict): extra options as a dict. """ + self.lang = lang self._src = src - self.general = general + self.t_dir = t_dir + self.conn = get_connection() + with self.conn as conn: + general = get_lang_general(conn, self.lang) + self.general = general["data"] + self.lang_id = general["id"] self.options = options - self.langsec = langsec + self.hooks = get_lang_hooks(self.conn, self.lang_id, self.t_dir) self.dest_ls = [] self.warnings = [] + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.conn.close() + def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}): """ @@ -73,234 +91,225 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}): Return: str: The transliterated string. """ - source_str = "Latin" if t_dir == "r2s" else lang - target_str = lang if t_dir == "r2s" else "Latin" - logger.info(f"Transliteration is from {source_str} to {target_str}.") - - cfg = load_table(lang) - logger.info(f"Loaded table for {lang}.") - - # General directives. - general = cfg.get("general", {}) - - if t_dir == "s2r" and "script_to_roman" not in cfg: - raise NotImplementedError( - f"Script-to-Roman transliteration not yet supported for {lang}." - ) - elif t_dir == "r2s" and "roman_to_script" not in cfg: - raise NotImplementedError( - f"Roman-to-script transliteration not yet supported for {lang}." - ) + # Map t_dir to constant. + t_dir = FEAT_S2R if t_dir == "s2r" else FEAT_R2S - langsec = ( - cfg["script_to_roman"] if t_dir == "s2r" - else cfg["roman_to_script"]) - # langsec_dir = langsec.get("directives", {}) - langsec_hooks = langsec.get("hooks", {}) + source_str = "Roman" if t_dir == FEAT_R2S else lang + target_str = lang if t_dir == FEAT_R2S else "Roman" + logger.info(f"Transliteration is from {source_str} to {target_str}.") src = src.strip() options["capitalize"] = capitalize - ctx = Context(src, general, langsec, options) - - # This hook may take over the whole transliteration process or delegate it - # to some external process, and return the output string directly. 
- if _run_hook("post_config", ctx, langsec_hooks) == BREAK: - return getattr(ctx, "dest", ""), ctx.warnings - - if "normalize" in ctx.langsec: - _normalize_src(ctx) - - if _run_hook("post_normalize", ctx, langsec_hooks) == BREAK: - return getattr(ctx, "dest", ""), ctx.warnings - - # Loop through source characters. The increment of each loop depends on - # the length of the token that eventually matches. - ignore_list = langsec.get("ignore", []) # Only present in R2S - ctx.cur = 0 - word_boundary = langsec.get("word_boundary", WORD_BOUNDARY) - - while ctx.cur < len(ctx.src): - # Reset cursor position flags. - # Carry over extended "beginning of word" flag. - ctx.cur_flags = 0 - cur_char = ctx.src[ctx.cur] - - # Look for a word boundary and flag word beginning/end it if found. - if _is_bow(ctx.cur, ctx, word_boundary): - # Beginning of word. - logger.debug(f"Beginning of word at position {ctx.cur}.") - ctx.cur_flags |= BOW - if _is_eow(ctx.cur, ctx, word_boundary): - # End of word. - logger.debug(f"End of word at position {ctx.cur}.") - ctx.cur_flags |= EOW - - # This hook may skip the parsing of the current - # token or exit the scanning loop altogether. - hret = _run_hook("begin_input_token", ctx, langsec_hooks) - if hret == BREAK: - logger.debug("Breaking text scanning from hook signal.") - break - if hret == CONT: - logger.debug("Skipping scanning iteration from hook signal.") - continue - - # Check ignore list. Find as many subsequent ignore tokens - # as possible before moving on to looking for match tokens. - ctx.tk = None - while True: - ctx.ignoring = False - for ctx.tk in ignore_list: - hret = _run_hook("pre_ignore_token", ctx, langsec_hooks) - if hret == BREAK: - break - if hret == CONT: - continue + with Context(lang, src, t_dir, options) as ctx: + + if t_dir == FEAT_S2R and not ctx.general["has_s2r"]: + raise NotImplementedError( + f"Script-to-Roman not yet supported for {lang}." + ) + if t_dir == FEAT_R2S and not ctx.general["has_r2s"]: + raise NotImplementedError( + f"Roman-to-script not yet supported for {lang}." + ) + + # This hook may take over the whole transliteration process or delegate + # it to some external process, and return the output string directly. + if _run_hook("post_config", ctx) == BREAK: + return getattr(ctx, "dest", ""), ctx.warnings + + _normalize_src(ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) + + if _run_hook("post_normalize", ctx) == BREAK: + return getattr(ctx, "dest", ""), ctx.warnings + + lang_map = list(get_lang_map(ctx.conn, ctx.lang_id, ctx.t_dir)) + + # Loop through source characters. The increment of each loop depends on + # the length of the token that eventually matches. + ctx.cur = 0 + + while ctx.cur < len(ctx.src): + # Reset cursor position flags. + # Carry over extended "beginning of word" flag. + ctx.cur_flags = 0 + cur_char = ctx.src[ctx.cur] + + # Look for a word boundary and flag word beginning/end it if found. + if _is_bow(ctx.cur, ctx, WORD_BOUNDARY): + # Beginning of word. + logger.debug(f"Beginning of word at position {ctx.cur}.") + ctx.cur_flags |= BOW + if _is_eow(ctx.cur, ctx, WORD_BOUNDARY): + # End of word. + logger.debug(f"End of word at position {ctx.cur}.") + ctx.cur_flags |= EOW + + # This hook may skip the parsing of the current + # token or exit the scanning loop altogether. 
+ hret = _run_hook("begin_input_token", ctx) + if hret == BREAK: + logger.debug("Breaking text scanning from hook signal.") + break + if hret == CONT: + logger.debug("Skipping scanning iteration from hook signal.") + continue - step = len(ctx.tk) - if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]: - # The position matches an ignore token. - hret = _run_hook("on_ignore_match", ctx, langsec_hooks) + # Check ignore list. Find as many subsequent ignore tokens + # as possible before moving on to looking for match tokens. + ctx.tk = None + while True: + ctx.ignoring = False + for ctx.tk in get_lang_ignore(ctx.conn, ctx.lang_id): + hret = _run_hook("pre_ignore_token", ctx) if hret == BREAK: break if hret == CONT: continue - logger.info(f"Ignored token: {ctx.tk}") - ctx.dest_ls.append(ctx.tk) - ctx.cur += step - cur_char = ctx.src[ctx.cur] - ctx.ignoring = True + step = len(ctx.tk) + if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]: + # The position matches an ignore token. + hret = _run_hook("on_ignore_match", ctx) + if hret == BREAK: + break + if hret == CONT: + continue + + logger.info(f"Ignored token: {ctx.tk}") + ctx.dest_ls.append(ctx.tk) + ctx.cur += step + cur_char = ctx.src[ctx.cur] + ctx.ignoring = True + break + # We looked through all ignore tokens, not found any. Move on. + if not ctx.ignoring: break - # We looked through all ignore tokens, not found any. Move on. - if not ctx.ignoring: - break - # Otherwise, if we found a match, check if the next position may be - # ignored as well. - - delattr(ctx, "tk") - delattr(ctx, "ignoring") - - # Begin transliteration token lookup. - ctx.match = False - - for ctx.src_tk, ctx.dest_str in langsec["map"]: - hret = _run_hook("pre_tx_token", ctx, langsec_hooks) - if hret == BREAK: - break - if hret == CONT: - continue + # Otherwise, if we found a match, check if the next position + # may be ignored as well. - step = len(ctx.src_tk.content) - # If the token is longer than the remaining of the string, - # it surely won't match. - if ctx.cur + step > len(ctx.src): - continue + delattr(ctx, "tk") + delattr(ctx, "ignoring") - # If the first character of the token is greater (= higher code - # point value) than the current character, then break the loop - # without a match, because we know there won't be any more match - # due to the alphabetical ordering. - if ctx.src_tk.content[0] > cur_char: - logger.debug( - f"{ctx.src_tk.content} is after " - f"{ctx.src[ctx.cur:ctx.cur + step]}. Breaking loop.") - break + # Begin transliteration token lookup. + ctx.match = False - # If src_tk has a WB flag but the token is not at WB, skip. - if ( - (ctx.src_tk.flags & BOW and not ctx.cur_flags & BOW) - or - # Can't rely on EOW flag, we must check on the last character - # of the potential match. - (ctx.src_tk.flags & EOW and not _is_eow( - ctx.cur + step - 1, ctx, word_boundary)) - ): - continue - - # Longer tokens should be guaranteed to be scanned before their - # substrings at this point. - # Similarly, flagged tokens are evaluated first. - if ctx.src_tk.content == ctx.src[ctx.cur:ctx.cur + step]: - ctx.match = True - # This hook may skip this token or break out of the token - # lookup for the current position. - hret = _run_hook("on_tx_token_match", ctx, langsec_hooks) + for ctx.src_tk, ctx.dest_str in lang_map: + hret = _run_hook("pre_tx_token", ctx) if hret == BREAK: break if hret == CONT: continue - # A match is found. Stop scanning tokens, append result, and - # proceed scanning the source. 
+ step = len(ctx.src_tk.content) + # If the token is longer than the remaining of the string, + # it surely won't match. + if ctx.cur + step > len(ctx.src): + continue - # Capitalization. + # If the first character of the token is greater (= higher code + # point value) than the current character, then break the loop + # without a match, because we know there won't be any more + # match due to the alphabetical ordering. + if ctx.src_tk.content[0] > cur_char: + logger.debug( + f"{ctx.src_tk.content} is after " + f"{ctx.src[ctx.cur:ctx.cur + step]}. " + "Breaking loop.") + break + + # If src_tk has a WB flag but the token is not at WB, skip. if ( - (ctx.options["capitalize"] == "first" and ctx.cur == 0) + (ctx.src_tk.flags & BOW and not ctx.cur_flags & BOW) or - ( - ctx.options["capitalize"] == "all" - and ctx.cur_flags & BOW - ) + # Can't rely on EOW flag, we must check on the last + # character of the potential match. + (ctx.src_tk.flags & EOW and not _is_eow( + ctx.cur + step - 1, ctx, WORD_BOUNDARY)) ): - logger.info("Capitalizing token.") - double_cap = False - for dcap_rule in ctx.langsec.get("double_cap", []): - if ctx.dest_str == dcap_rule: - ctx.dest_str = ctx.dest_str.upper() - double_cap = True - break - if not double_cap: - ctx.dest_str = ( - ctx.dest_str[0].upper() + ctx.dest_str[1:]) + continue - ctx.dest_ls.append(ctx.dest_str) - ctx.cur += step - break + # Longer tokens should be guaranteed to be scanned before their + # substrings at this point. + # Similarly, flagged tokens are evaluated first. + if ctx.src_tk.content == ctx.src[ctx.cur:ctx.cur + step]: + ctx.match = True + # This hook may skip this token or break out of the token + # lookup for the current position. + hret = _run_hook("on_tx_token_match", ctx) + if hret == BREAK: + break + if hret == CONT: + continue - if ctx.match is False: - delattr(ctx, "match") - hret = _run_hook("on_no_tx_token_match", ctx, langsec_hooks) - if hret == BREAK: - break - if hret == CONT: - continue + # A match is found. Stop scanning tokens, append result, + # and proceed scanning the source. + + # Capitalization. + if ( + (ctx.options["capitalize"] == "first" and ctx.cur == 0) + or + ( + ctx.options["capitalize"] == "all" + and ctx.cur_flags & BOW + ) + ): + logger.info("Capitalizing token.") + double_cap = False + for dcap_rule in get_lang_dcap(ctx.conn, ctx.lang_id): + if ctx.dest_str == dcap_rule: + ctx.dest_str = ctx.dest_str.upper() + double_cap = True + break + if not double_cap: + ctx.dest_str = ( + ctx.dest_str[0].upper() + ctx.dest_str[1:]) + + ctx.dest_ls.append(ctx.dest_str) + ctx.cur += step + break + + if ctx.match is False: + delattr(ctx, "match") + hret = _run_hook("on_no_tx_token_match", ctx) + if hret == BREAK: + break + if hret == CONT: + continue - # No match found. Copy non-mapped character (one at a time). - logger.info( - f"Token {cur_char} (\\u{hex(ord(cur_char))[2:]}) " - f"at position {ctx.cur} is not mapped.") - ctx.dest_ls.append(cur_char) - ctx.cur += 1 - else: - delattr(ctx, "match") - delattr(ctx, "cur_flags") + # No match found. Copy non-mapped character (one at a time). + logger.info( + f"Token {cur_char} (\\u{hex(ord(cur_char))[2:]}) " + f"at position {ctx.cur} is not mapped.") + ctx.dest_ls.append(cur_char) + ctx.cur += 1 + else: + delattr(ctx, "match") + delattr(ctx, "cur_flags") - delattr(ctx, "cur") + delattr(ctx, "cur") - # This hook may take care of the assembly and cause the function to return - # its own return value. 
- hret = _run_hook("pre_assembly", ctx, langsec_hooks) - if hret is not None: - return hret, ctx.warnings + # This hook may take care of the assembly and cause the function to + # return its own return value. + hret = _run_hook("pre_assembly", ctx) + if hret is not None: + return hret, ctx.warnings - logger.debug(f"Output list: {ctx.dest_ls}") - ctx.dest = "".join(ctx.dest_ls) + logger.debug(f"Output list: {ctx.dest_ls}") + ctx.dest = "".join(ctx.dest_ls) - # This hook may reassign the output string and/or cause the function to - # return it immediately. - hret = _run_hook("post_assembly", ctx, langsec_hooks) - if hret is not None: - return hret, ctx.warnings + # This hook may reassign the output string and/or cause the function to + # return it immediately. + hret = _run_hook("post_assembly", ctx) + if hret is not None: + return hret, ctx.warnings - # Strip multiple spaces and leading/trailing whitespace. - ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip()) + # Strip multiple spaces and leading/trailing whitespace. + ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip()) - return ctx.dest, ctx.warnings + return ctx.dest, ctx.warnings -def _normalize_src(ctx): - for nk, nv in ctx.langsec.get("normalize", {}).items(): +def _normalize_src(ctx, norm_rules): + for nk, nv in norm_rules.items(): ctx._src = ctx.src.replace(nk, nv) logger.debug(f"Normalized source: {ctx.src}") @@ -317,11 +326,13 @@ def _is_eow(cur, ctx, word_boundary): ) and (ctx.src[cur] not in word_boundary) -def _run_hook(hname, ctx, hooks): +def _run_hook(hname, ctx): ret = None - for hook_def in hooks.get(hname, []): - kwargs = hook_def[1] if len(hook_def) > 1 else {} - ret = hook_def[0](ctx, **kwargs) + for hook_def in ctx.hooks.get(hname, []): + fn = getattr( + import_module("." + hook_def["module_name"], HOOK_PKG_PATH), + hook_def["fn_name"]) + ret = fn(ctx, **hook_def["kwargs"]) if ret in (BREAK, CONT): # This will stop parsing hooks functions and tell the caller to # break out of the outer loop or skip iteration. diff --git a/sscli b/sscli new file mode 100755 index 0000000..154aaf2 --- /dev/null +++ b/sscli @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +__doc__ = """ Scriptshifter command line interface. """ + + +import click + +from glob import glob +from os import path + +from scriptshifter import DB_PATH +from scriptshifter.tables import init_db as _init_db +from tests import test_sample + + +@click.group() +def cli(): + """ Scriptshifter CLI. """ + pass + + +@cli.group(name="admin") +def admin_grp(): + """ Admin operations. """ + pass + + +@admin_grp.command() +def init_db(): + """ Initialize SS database. """ + _init_db() + + click.echo(f"Initialized Scriptshifter DB in {DB_PATH}") + + +@cli.group(name="test") +def test_grp(): + """ Test operations. """ + pass + + +@test_grp.command() +def list_samples(): + """ List string sample sets that can be tested. """ + path_ptn = path.join( + path.dirname(path.realpath(__file__)), + "tests", "data", "script_samples", "*.csv") + + click.echo("Sample string sets available for batch testing:") + for fn in glob(path_ptn): + click.echo(path.splitext(path.basename(fn))[0]) + + +@test_grp.command() +@click.argument("lang") +def samples(lang): + """ + Test sample strings for language LANG. + + LANG must match one of the names obtained with `test list-samples` command. + + The command will generate a test report file. + """ + return test_sample(lang) + + +@cli.group(name="trans") +def trans_grp(): + """ Transliteration and transcription operations. 
""" + pass + + +if __name__ == "__main__": + cli() diff --git a/tests/__init__.py b/tests/__init__.py index aaaa1b7..e4cde3e 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -20,7 +20,8 @@ def reload_tables(): reload(scriptshifter.tables) # Reload new config dir. from scriptshifter import tables tables.list_tables.cache_clear() - tables.load_table.cache_clear() + tables.get_language.cache_clear() + tables.get_lang_map.cache_clear() return tables diff --git a/tests/data/script_samples/arabic.csv b/tests/data/script_samples/arabic.csv new file mode 100644 index 0000000..b459dab --- /dev/null +++ b/tests/data/script_samples/arabic.csv @@ -0,0 +1,3 @@ +arabic,نظام الحكم في عمان : من إمامة الإنتخاب الى السلطنة الوراثية,Niẓām al-ḥukm fī ʻUmān : min imāmat al-intikhāb ilá al-salṭanah al-wirāthīyah,, +arabic,ندوة علاقات مصر بدول حوض النيل في ظل رئاسة مصر للاتحاد الإفريقي,Nadwat ʻAlāqāt Miṣr bi-Duwal Ḥawḍ al-Nīl fī ẓill Riʼāsat Miṣr lil-Ittiḥād al-Ifrīqī,, +arabic,تهذيب البيان والجمع في الفرق بين التكليف والوضع,Tahdhīb al-bayān wa-al-jamʻ fī al-farq bayna al-taklīf wa-al-waḍʻ,, diff --git a/tests/data/script_samples/arabic2.csv b/tests/data/script_samples/arabic2.csv new file mode 100644 index 0000000..44dee39 --- /dev/null +++ b/tests/data/script_samples/arabic2.csv @@ -0,0 +1,94 @@ +arabic,قضايا فكرية و سياسية باقلام كردية عراقية ,Qaḍāyā fikrīyah wa siyāsīyah bi-aqlām Kurdīyah ʻIrāqīyah,, +arabic,‏الأستاذ الدكتور عماد الجواهري؛ مراجعة و تقديم الأستاذ الدكتور عبد الفتاح علي البوتاني,al-Ustādh al-Duktūr ʻImād al-Jawāhirī; murājaʻat wa taqdīm al-Ustādh al-Duktūr ʻAbd al-Fattāḥ ʻAlī al-Būtānī,, +arabic,العلاقة الشيعية - الكوردية ومستقبلها,al-ʻAlāqah al-Shīʻīyah - al-Kūrdīyah wa-mustaqbaluhā,, +arabic,مركز دراسات رووداو,Markaz Dirāsāt Rūwūdāw,, +arabic,ماذا يخبئ الغربال في السياسة العراقية,Mādhā yukhabbiʼ al-ghurbāl fī al-siyāsah al-ʻIrāqīyah,, +arabic,الحزب الشيوعي العراقي .. المكونات السياسية .. الحكومة,al-Ḥizb al-Shuyūʻī al-ʻIrāqī .. al-mukawwināt al-siyāsīyah .. 
al-ḥukūmah,, +arabic,الدولة الأموية في الشام,al-Dawlah al-Umawīyah fī al-Shām,, +arabic,تأليف أنيس زكريا النصولي,taʼlīf Anīs Zakarīyā al-Nuṣūlī.,, +arabic,الدين وسياسة الدولة في بلاد الرافدين في ضوء النصوص المسمارية، (٢٨٠٠ ق.م-٥٣٩ ق.م) ,"al-Dīn wa-siyāsat al-dawlah fī bilād al-Rāfidayn fī ḍawʼ al-nuṣūṣ al-mismārīyah, (2800 Q.M-539 Q.M)",, +arabic,المدن والموانيء التجارية في شرق الجزيرة العربية منذ بداية الالف الثالث ق.م حتى نهاية الالف الاول ق.م ,al-Mudun wa-al-mawānīʼ al-tijārīyah fī sharq al-jazīrah al-ʻArabīyah mundhu bidāyat al-alf al-thālith Q.M ḥattá nihāyat al-alf al-awwal Q.M,, +arabic,أمير الإنسانية وقائد الدبلماسية ,Amīr al-insānīyah wa-qāʼid al-diblumāsīyah,, +arabic,النقد الادبي واللغوي المعاصر ,al-Naqd al-adabī wa-al-lughawī al-muʻāṣir,, +arabic,جدلية الاصالة والتجديد : المؤتمر النقدي الرابع والعشرون,Jadalīyat al-aṣālah wa-al-tajdīd : al-muʼtamar al-naqdī al-rābiʻ wa-al-ʻishrūn,, +arabic,أماني سراج عبدالوهاب أبوزيد,Amānī Sirāj ʻAbd al-Wahhāb Abū Zayd,, +arabic,المدن والموانيء التجارية في شرق الجزيرة العربية منذ بداية الالف الثالث ق.م حتى نهاية الالف الاول ق.م,al-Mudun wa-al-mawānīʼ al-tijārīyah fī sharq al-Jazīrah al-ʻArabīyah mundhu bidāyat al-alf al-thālith Q.M ḥattá nihāyat al-alf al-awwal Q.M,, +arabic,محمد صوضان,Muḥammad Ṣawḍān,, +arabic,كتاب سفينة السعادة لاهل الضعف والنجادة في مديح النبي، المعروفة، بالعشرينيات ‏ ," Kitāb Safīnat al-saʻādah li-ahl al-ḍaʻf wa-al-najādah fī madīḥ al-Nabī, al-maʻrūfah, bi-al-ʻIshrīniyāt",, +arabic,من الشريعة الموروثة إلى الإنسان الخليفة ,Min al-sharīʻah al-mawrūthah ilá al-insān al-khalīfah,, +arabic,إعداد محمد هشام بوعتور, iʻdād Muḥammad Hishām Bū ʻAttūr,, +arabic,موسوعة الحكايات الخرافية الفلسطينية ,Mawsūʻat al-ḥikāyāt al-khurāfīyah al-Filasṭīnīyah,, +arabic,مؤسسة تامر للتعليم المجتمعي,Muʼassasat Tāmir lil-Taʻlīm al-Mujtamaʻī,, +arabic,نصوص ودراسة في ‌الحكاية الشعبية الفلسطينية, nuṣūṣ wa-dirāsah fi al-ḥikāyah al-shaʻbīyah al-Filasṭīnīyah,, +arabic,تأليف إبراهيم مهوي و شريف كناعنه ,taʼlīf Ibrāhīm Muhawwī wa-Sharīf Kanāʻinah,, +arabic,التراث الفلسطيني بين الطمس والاحياء, al-Turāth al-Filasṭīnī bayna al-ṭams wa-al-iḥyāʼ,, +arabic,أشرف على تحريرها منعم حداد,ashrafa ʻalá taḥrīrihā Munʻim Ḥaddād,, +arabic,من تراثنا الشعبي في السهل الساحلي الفلسطيني ,Min turāthinā al-shaʻbī fī al-sahl al-sāḥilī al-Filastīnī,, +arabic,بقلم حسن محمد عوض,bi-qalam Ḥasan Muḥammad ʻAwaḍ,, +arabic,تاريخ ما لم يذكره التاريخ,Tārīkh mā lam yadhkurhu al-tārīkh,, +arabic,دراسة ميدانية فى التراث الشعبى الفلسطينى,dirāsah maydānīyah fī al-turāth al-shaʻbī al-Filasṭīnī ,, +arabic,بيت الفلاح الفلسطيني, Bayt al-falāḥ al-Filasṭīnī,, +arabic,معان ثقافية وعادات وتقاليد اجتماعية، اثاث وفراش وادوات,"maʻānin thaqāfīyah wa-ʻādāt wa-taqālīd ijtimāʻīyah, athāth wa-firāsh wa-adawāt",, +arabic,الحزازير والألعاب الشعبية الفلسطينية,al-Ḥazāzīr wa-al-alʻāb al-shaʻbīyah al-Filasṭīnīyah,, +arabic,المرأة في المثل الشعبي في الأردن وفلسطين,al-Marʼah fī al-mathal al-shaʻbī fī al-Urdun wa-Filasṭīn,, +arabic,الأحاجي والالغاز الادبية ,al-Aḥājī wa-al-alghāz al-adabīyah,, +arabic,فصول الحياة في قريتي,fuṣūl al-ḥayāh fī qaryatī,, +arabic,قرية الدمينة الشرقية بين الماضي والحاضر,Qaryat al-Dumaynah al-Sharqīyah bayna al-māḍī wa-al-ḥāḍir,, +arabic,الألعاب الشعبية في الجزيرة السورية,al-Alʻāb al-shaʻbīyah fī al-Jazīrah al-Sūrīyah,, +arabic,وزارة الثقافة، منشورات الهيئه العامة السورية للكتاب,"Wizārat al-Thaqāfah, Manshūrāt al-Hayʼah al-ʻĀmmah al-Sūrīyah lil-Kitāb",, +arabic,طرائف الأمس غرائب اليوم,Ṭarāʼif al-ams gharāʼib al-Yawm,, +arabic,صور من حياة النبك وجبل القلمون في أواسط القرن التاسع عشر,ṣuwar min ḥayāt 
al-Nabk wa-Jabal al-Qalamūn fī awāsiṭ al-qarn al-tāsiʻ ʻashar,, +arabic,ولدت مرتين,Wulidtu marratayn,, +arabic,من حكايا الدمع في سوريا,min Ḥakāyā al-damʻ fī Sūriyā,, +arabic,العين والماء والفخار في التراث الساحلي الريفي,al-ʻAyn wa-al-māʼ wa-al-fukhkhār fī al-turāth al-sāḥilī al-rīfī,, +arabic,المواسم التقليدية بمنطقة الأبيض سيدي الشيخ، الوعدات,"al-Mawāsim al-taqlīdīyah bi-minṭaqat al-Abyaḍ Sīdī al-Shaykh, al-Waʻdāt",, +arabic,فضاءات تلقي الادب الشعبي,Faḍāʼāt talaqqī al-adab al-shaʻbī,, +arabic,المجتمع الجزائري وفعالياته في العهد العثماني,al-Mujtamaʻ al-Jazāʼirī wa-faʻʻālīyātuhu fī al-ʻahd al-ʻUthmānī,, +arabic,بدو الطوارق بين الثبات والتغير ,Badw al-Ṭawāriq bayna al-thabāt wa-al-taghayyur,, +arabic,النظم الإجتماعية والتغيرات المرافقة للمد العربي,al-nuẓum al-ijtimāʻīyah wa-al-taghayyurāt al-murāfiqah lil-madd al-ʻArabī,, +arabic,لماذا يصحو مارد الهضبة ويغفو مارد السهل,Li-mādhā yaṣʹḥū mārid al-haḍabah wa-yaghfū mārid al-sahl,, +arabic,رؤى الحداثة وآفاق التحولات في الخطاب الأدبي الأردني الحداثي,ruʼá al-ḥadāthah wa-āfāq al-taḥawwulāt fī al-khiṭāb al-Adabī al-Urdunī al-ḥadāthī,, +arabic,الحقيبة الملكية على الطائر الميمون ‏ ,al-Ḥaqībah al-malakīyah ʻalá al-ṭāʼir al-maymūn,, +arabic,عيسى الناعوري وجهوده في مجال الدراسات الادبية والنقدية,ʻĪsá al-Nāʻūrī wa-juhūduh fī majāl al-dirāsāt al-adabīyah wa-al-naqdīyah,, +arabic,أقحوان على ضفاف النهر ,Uqḥuwān ʻalá ḍifāf al-nahr,, +arabic,صورة المرأة في... السرد النسوي الأردني,Ṣūrat al-marʼah fī ... al-sard al-niswī al-Urdunī,, +arabic,آراء ونصوص في تجربته الادبية,Ārāʼ wa-nuṣūṣ fī tajribatih al-adabīyah,, +arabic,مدخل الى أدبنا المعاصر,Madkhal ilá adabinā al-muʻāṣir,, +arabic,صاحب المئة كتاب والستين عاما في خدمة التربية والتعليم,ṣāḥib al-miʼat kitāb wa-al-sittīn ʻāman fī khidmat al-tarbiyah wa-al-taʻlīm,, +arabic,خمسة رواد يحاورون العصر,khamsat rūwād yuḥāwirūn al-ʻaṣr,, +arabic,حوار مع رواد النهضة العربية,Ḥiwār maʻa rūwād al-nahḍah al-ʻArabīyah,, +arabic,أعلام الحركة الادبية في الرقة,Aʻlām al-ḥarakah al-adabīyah fī al-Raqqah,, +arabic,دراسة تحليلية في أدب الأطفال لدى الكرد في سوريا وأبرز نماذجه المدونة,dirāsah taḥlīlīyah fī adab al-aṭfāl ladá al-Kurd fī Sūriyā wa-abraz namādhijihi al-mudawwanah,, +arabic,دراسات ومقالات حول حياة الكتاب والكتاب, dirāsāt wa-maqālāt ḥawla ḥayāt al-kuttāb wa-al-kitāb,, +arabic,القصص القرآني : إيحاؤه ونفحاته ,al-Qaṣaṣ al-Qurʼānī : īḥāʼuhu wa-nafaḥātuh ,, +arabic,للسائلين عن، أخلاق وطبائع بني إسرائيل في قصة يوسف عليه السلام,"Lil-sāʼilīn ʻan, Akhlāq wa-ṭabāʼiʻ Banī Isrāʼīl fī qiṣṣat Yūsuf ʻalayhi al-Salām",, +arabic,إبراهيم الدسوقي عبد الرحمن,Ibrāhīm al-Dasūqī ʻAbd al-Raḥmān,, +arabic,لا تكن كابني آدم,Lā takun ka-ibnay Ādam ,, +arabic,لا قاتلا ولا مقتولا,lā qātilan wa-lā maqtūlan,, +arabic,الجانب الفني في القصة القرآنية,al-jānib al-fannī fī al-qiṣṣah al-Qurʼānīyah,, +arabic,منهجها، وأسس بنائها,"manhajuhā, wa-usus bināʼihā ",, +arabic,المبادىء التربوية والأسس النفسية في القصص القرآني,al-Mabādiʼ al-tarbawīyah wa-al-usus al-nafsīyah fī al-qaṣaṣ al-Qurʼānī ,, +arabic,الابتلاءات الشديدة عند مخالفة الشريعة,al-Ibtilāʼāt al-shadīdah ʻinda mukhālafat al-Sharīʻah,, +arabic,للداعية الإسلامي الشيخ محمد ياسين أبو يحيى,lil-Dāʻiyah al-Islāmī al-Shaykh Muḥammad Yāsīn Abū Yaḥyá,, +arabic,روضة المشتاقين في فضائل الأنبياء والمرسلين وشيء من أخبارهم,Rawḍat al-mushtāqīn fī faḍāʼil al-anbiyāʼ wa-al-mursalīn wa-shayʼ min akhbārihim,, +arabic,يحيى خذ الكتاب بقوة,Yaḥyá khudh al-kitāb bi-qūwah,, +arabic,خصائص التراكيب ودلالاتها في القصص القرآني,Khaṣāʼiṣ al-tarākīb wa-dalālātuhā fī al-qaṣaṣ al-Qurʼānī,, +arabic,الخطيئة والصراع,al-khaṭīʼah wa-al-ṣirāʻ,, 
+arabic,اللاموضوعية عند المفسرين :‏,al-Lāmawḍūʻīyah ʻinda al-mufassirīn ,, +arabic,القصص القرآني بين الآباء والابناء :‏ ,al-Qaṣaṣ al-Qurʼānī bayna al-ābāʼ wa-al-abnāʼ ,, +arabic,الاتساع النصي في القصص القرآني بين الاستباق والاسترجاع ‏ ,al-Ittisāʻ al-naṣṣī fī al-qaṣaṣ al-Qurʼānī bayna al-istibāq wa-al-istirjāʻ,, +arabic,فأزلهما الشيطان عنها فأخرجهما مما كانا فيه,fa-azallahumā al-Shayṭān ʻanhā fa-akhrajahumā mimmā kānā fīhi,, +arabic,آثار المشتق البليغ من قصة يوسف الصديق ,Āthār al-mushtaqq al-balīgh min qiṣṣat Yūsuf al-Ṣiddīq,, +arabic,الجامع الصحيح في القصص النبوي,al-Jāmiʻ al-ṣaḥīḥ fī al-qaṣaṣ al-Nabawī,, +arabic,يطبع لاول مرة محققا عا نسخة الحافظ الذهبي التي كتبها بخطة,Yuṭbaʻu li-awwal marrah muḥaqqiqan ʻan nuskhah al-Ḥāfiẓ al-Dhahabī allatī katabahā bi-khuṭṭat.,, +arabic,,Yuṭbaʻu li-awwal marrah ʻan nuskhah Nafīsah manqūlah bi-khaṭṭ al-muʼallif bi-khaṭṭ al-muʼallif,, +arabic,العربية ولهجاتها ,al-ʻArabīyah wa-lahajātuhā,, +arabic,اللغة المهرية المعاصرة بين عربيتين,al-Lughah al-Mahrīyah al-muʻāṣirah bayna ʻArabīyatayn,, +arabic,نحو عربية ميسرة‏, Naḥwa ʻArabīyah muyassarah,, +arabic,لغات القبائل في كتب إعراب القرآن ومعانيه,Lughāt al-qabāʼil fī kutub iʻrāb al-Qurʼān wa-maʻānīh,, +arabic,الأدب الجاهلي بين لهجات القبائل وللغة الموحدة,al-adab al-Jāhilī bayna Lahajāt al-qabāʼil wa-al-lughah al-muwaḥḥadah,, +arabic,التحليل العام للغة العوام,al-Tahḷīl al-ʻāmm li-lughat al-ʻawāmm,, +arabic,تاريخ الدعوة إلى العامية وآثارها في مصر ‏ ,Tārīkh al-Daʻwah ilá al-ʻāmmīyah wa-āthāruhā fī Miṣr,, +arabic,الفصيح الذي حفظته العامية العراقية بين الدراسة والتطبيق,al-faṣīḥ alladhī ḥafiẓatʹhu al-ʻāmmīyah al-ʻIrāqīyah bayna al-dirāsah wa-al-taṭbīq,, +arabic,ويلي ذلك معجم بألفاظ اللهجة الشائعة في العراق,wa-yalī dhālika Muʻjam bi-alfāẓ al-lahjah al-shāʼiʻah fī al-ʻIrāq,, +arabic,كلمات فارسية مستعملة في عامية الموصل وفي انحاء العراق,Kalimāt Fārisīyah mustaʻmalah fī ʻāmmīyat al-Mūṣil wa-fī anḥāʼ al-ʻIrāq,, diff --git a/tests/test01_cfg.py b/tests/test01_cfg.py index ca70d05..c861d91 100644 --- a/tests/test01_cfg.py +++ b/tests/test01_cfg.py @@ -113,7 +113,7 @@ def test_rot3(self): tbl["script_to_roman"]["hooks"], { "begin_input_token": [ - (scriptshifter.hooks.test.rotate, {"n": -3}) + ("test", scriptshifter.hooks.test.rotate, {"n": -3}) ] })
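Taken together, the changes above replace the cached YAML parsing with a SQLite-backed lookup: `get_language()` reassembles a language configuration from `tbl_language`, `tbl_trans_map`, `tbl_hook`, `tbl_ignore`, `tbl_normalize`, `tbl_double_cap` and `tbl_option`, while `transliterate()` opens its own connection through the `Context` manager and resolves hook functions by module and function name at call time. A minimal end-to-end sketch, assuming the DB has already been built (e.g. via `./sscli admin init-db`) and using `arabic` as a stand-in for any language listed in `scriptshifter/tables/data/index.yml`:

```python
# Hedged example of the DB-backed runtime flow; "arabic" is only a stand-in
# for a language that exists in the index and supports script-to-Roman.
from scriptshifter.tables import get_language
from scriptshifter.trans import transliterate

cfg = get_language("arabic")
print(cfg["label"], cfg["has_s2r"], cfg["has_r2s"])  # attributes from tbl_language
print(cfg.get("options", ()))                        # tbl_option rows, JSON fields decoded

romanized, warnings = transliterate("نظام الحكم في عمان", "arabic", t_dir="s2r")
print(romanized)
print(warnings)
```

Note that `get_language()` is memoized with `@cache`, so after editing anything under `scriptshifter/tables/data` the DB must be rebuilt and the process restarted for the changes to be picked up (the test suite does this via `reload_tables()`).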