diff --git a/lingua_franca/internal.py b/lingua_franca/internal.py index bb2e04a2..f2c5f0c6 100644 --- a/lingua_franca/internal.py +++ b/lingua_franca/internal.py @@ -10,13 +10,14 @@ _SUPPORTED_LANGUAGES = ("ca", "cs", "da", "de", "en", "es", "fr", "hu", - "it", "nl", "pl", "pt", "ru", "sl", "sv", "fa") + "it", "nl", "pl", "pt", "ru", "sl", "sv", "fa", + "eu", "syr") _SUPPORTED_FULL_LOCALIZATIONS = ("ca-es", "cs-cz", "da-dk", "de-de", "en-au", "en-us", "es-es", "fr-fr", "hu-hu", "it-it", "nl-nl", "pl-pl", "fa-ir", "pt-pt", "ru-ru", "sl-si", - "sv-se", "tr-tr") + "sv-se", "syr-sy", "tr-tr", "eu-eu") _DEFAULT_FULL_LANG_CODES = {'ca': 'ca-es', 'cs': 'cs-cz', @@ -24,6 +25,7 @@ 'de': 'de-de', 'en': 'en-us', 'es': 'es-es', + 'eu': 'eu-eu', 'fa': 'fa-ir', 'fr': 'fr-fr', 'hu': 'hu-hu', @@ -34,6 +36,7 @@ 'ru': 'ru-ru', 'sl': 'sl-si', 'sv': 'sv-se', + 'syr': 'syr-sy', 'tr': 'tr-tr'} __default_lang = None diff --git a/lingua_franca/lang/common_data_syr.py b/lingua_franca/lang/common_data_syr.py new file mode 100644 index 00000000..4289a996 --- /dev/null +++ b/lingua_franca/lang/common_data_syr.py @@ -0,0 +1,185 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#
from collections import OrderedDict
from .parse_common import invert_dict

# NOTE(review): presumably the message shown when a feature has no Syriac
# implementation; nothing in the visible format/parse code references it.
_FUNCTION_NOT_IMPLEMENTED_WARNING = "ܐܗܐ ܣܘܥܪܢܐ ܠܐ ܝܠܗ ܦܝܫܐ ܬܘܡܡܐ ܒܠܫܢܐ ܣܘܪܝܝܐ"

# Word rules for gender
# The feminine ending also ends with the masculine ending, so a suffix test
# has to try the feminine ending first.
_SYRIAC_FEMALE_ENDINGS = ["ܬܐ"]
_SYRIAC_MALE_ENDINGS = ["ܐ"]

# Special cases, word lookup for words not covered by above rule

# Masculine gender denotes names of:
#   - rivers, islands, days of the week (except:Saturday and Sunday)
#   - words where the letter ܬ does not appear as a suffix, but as part of
#     the root (ܒܝܬܐ، ܡܘܬܐ)
#   - loanwords with penultimate letter ܬ referring to masculine gender
#     such as ܐܟܬܐ

# Maps a noun to "f" or "m" where the ending rule above gives the wrong answer.
_SYRIAC_GENDERED_NOUNS_EXCEPTIONS = {
    "ܥܪܘܒܬܐ": "f",
    "ܫܒܬܐ": "f",
    "ܕܩܠܬ": "m",
    "ܦܪܬ": "m",
    "ܒܝܬܐ": "m",
    "ܡܘܬܐ": "m"
}

# Cardinal words indexed by value: index n holds the word for n (0..19).
# Index 0 is the empty string, so "" is a member of this list.
_SYRIAC_ONES = [
    "",
    "ܚܕ",
    "ܬܪܝܢ",
    "ܬܠܬܐ",
    "ܐܪܒܥܐ",
    "ܚܡܫܐ",
    "ܫܬܐ",
    "ܫܒܥܐ",
    "ܬܡܢܝܐ",
    "ܬܫܥܐ",
    "ܥܣܪܐ",
    "ܚܕܥܣܪ",
    "ܬܪܥܣܪ",
    "ܬܠܬܥܣܪ",
    "ܐܪܒܥܣܪ",
    "ܚܡܫܥܣܪ",
    "ܫܬܥܣܪ",
    "ܫܒܥܣܪ",
    "ܬܡܢܥܣܪ",
    "ܬܫܥܣܪ"
]

# Second set of cardinal words for 0..9, indexed by value like _SYRIAC_ONES.
# NOTE(review): the name suggests these are the feminine forms.
_SYRIAC_ONES_FEM = [
    "",
    "ܚܕܐ",
    "ܬܪܬܝܢ",
    "ܬܠܬ",
    "ܐܪܒܥ",
    "ܚܡܫ",
    "ܫܬ",
    "ܫܒܥ",
    "ܬܡܢܐ",
    "ܬܫܥ"
]

# Multiples of ten: index n holds the word for n * 10 (index 0 is empty).
_SYRIAC_TENS = [
    "",
    "ܥܣܪܐ",
    "ܥܣܪܝܢ",
    "ܬܠܬܝܢ",
    "ܐܪܒܥܝܢ",
    "ܚܡܫܝܢ",
    "ܫܬܝܢ",
    "ܫܒܥܝܢ",
    "ܬܡܢܝܢ",
    "ܬܫܥܝܢ"
]

# Multiples of one hundred: index n holds the word for n * 100.
_SYRIAC_HUNDREDS = [
    "",
    "ܡܐܐ",
    "ܬܪܝܢܡܐܐ",
    "ܬܠܬܡܐܐ",
    "ܐܪܒܥܡܐܐ",
    "ܚܡܫܡܐܐ",
    "ܫܬܡܐܐ",
    "ܫܒܥܡܐܐ",
    "ܬܡܢܡܐܐ",
    "ܬܫܥܡܐܐ"
]

# Names of the powers of one thousand: index n names 1000 ** n.
_SYRIAC_LARGE = [
    "",
    "ܐܠܦܐ",
    "ܡܠܝܘܢܐ",
    "ܡܠܝܪܐ",
    "ܒܠܝܘܢܐ",
    "ܒܠܝܪܐ"
]

# Ordinal words keyed by their numeric value. Only the listed values exist:
# 1..20, the round tens, the round hundreds, 1000 and 10000.
_SYRIAC_ORDINAL_BASE = {
    1: 'ܩܕܡܝܐ',
    2: 'ܬܪܝܢܐ',
    3: 'ܬܠܝܬܝܐ',
    4: 'ܪܒܝܥܝܐ',
    5: 'ܚܡܝܫܝܐ',
    6: 'ܫܬܝܬܝܐ',
    7: 'ܫܒܝܥܝܐ',
    8: 'ܬܡܝܢܝܐ',
    9: 'ܬܫܝܥܝܐ',
    10: 'ܥܣܝܪܝܐ',
    11: 'ܚܕܥܣܝܪܝܐ',
    12: 'ܬܪܥܣܝܪܝܐ',
    13: 'ܬܠܬܥܣܝܪܝܐ',
    14: 'ܐܪܒܥܣܝܪܝܐ',
    15: 'ܚܡܫܥܣܝܪܝܐ',
    16: 'ܫܬܥܣܝܪܝܐ',
    17: 'ܫܒܥܣܝܪܝܐ',
    18: 'ܬܡܢܥܣܝܪܝܐ',
    19: 'ܬܫܥܣܝܪܝܐ',
    20: 'ܥܣܪܝܢܝܐ',
    30: 'ܬܠܬܝܢܝܐ',
    40: 'ܐܪܒܥܝܢܝܐ',
    50: 'ܚܡܫܝܢܝܐ',
    60: 'ܫܬܝܢܝܐ',
    70: 'ܫܒܥܝܢܝܐ',
    80: 'ܬܡܢܝܢܝܐ',
    90: 'ܬܫܥܝܢܝܐ',
    100: 'ܐܡܝܐ',
    200: 'ܬܪܝܢܡܝܐ',
    300: 'ܬܠܬܡܝܐ',
    400: 'ܐܪܒܥܡܝܐ',
    500: 'ܚܡܫܡܝܐ',
    600: 'ܫܬܡܝܐ',
    700: 'ܫܒܥܡܝܐ',
    800: 'ܬܡܢܡܝܐ',
    900: 'ܬܫܥܡܝܐ',
    1000: 'ܐܠܦܝܐ',
    10000: 'ܪܒܘܬܢܝܐ'
}

# Fraction words keyed by denominator: the entry for n stands for 1/n.
# One half is not here; it lives in _SYRIAC_FRACTIONS_HALF.
_SYRIAC_FRACTIONS = {
    3: "ܬܘܠܬܐ",
    4: "ܪܘܒܥܐ",
    5: "ܚܘܡܫܐ",
    6: "ܫܘܬܬܐ",
    7: "ܫܘܒܥܐ",
    8: "ܬܘܡܢܐ",
    9: "ܬܘܫܥܐ",
    10: "ܥܘܣܪܐ",
    20: "ܚܕ ܡܢ ܥܣܪܝܢ",
    30: "ܚܕ ܡܢ ܬܠܬܝܢ",
    50: "ܚܕ ܡܢ ܚܡܫܝܢ",
    100: "ܚܕ ܡܢ ܡܐܐ",
    1000: "ܚܕ ܡܢ ܐܠܦܐ"
}

# Spellings that all stand for one half (0.5).
_SYRIAC_FRACTIONS_HALF = [
    "ܦܠܓܐ",
    "ܦܠܓܗ",
    "ܦܠܓܘ",
    "ܦܠܓܘܬ"
]

# Denominator words for decimal fractions. The number of decimal places is
# split with divmod(places, 3): the remainder indexes _SYRIAC_FRAC and the
# quotient indexes _SYRIAC_FRAC_BIG.
_SYRIAC_FRAC = ["", "ܥܣܪܐ", "ܡܐܐ"]
_SYRIAC_FRAC_BIG = ["", "ܐܠܦܐ", "ܡܠܝܘܢܐ", "ܒܠܝܘܢܐ" ]

# Fraction separator
_SYRIAC_SEPARATOR = " ܡܢ "

# Conjoiner
_SYRIAC_CONJOINER = " ܘ"

# Remainder of this flattened patch line (header of the next file), kept verbatim:
# diff --git a/lingua_franca/lang/format_syr.py b/lingua_franca/lang/format_syr.py new file mode 100644 index 00000000..01bd6105 --- /dev/null +++ b/lingua_franca/lang/format_syr.py @@ -0,0 +1,453 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

from lingua_franca.lang.format_common import convert_to_mixed_fraction
from lingua_franca.lang.common_data_syr import \
    _SYRIAC_ONES, _SYRIAC_TENS, _SYRIAC_HUNDREDS, _SYRIAC_LARGE, \
    _SYRIAC_ORDINAL_BASE, _SYRIAC_SEPARATOR, \
    _SYRIAC_CONJOINER, _SYRIAC_FRAC, _SYRIAC_FRAC_BIG
import math
import unicodedata
from lingua_franca.internal import lookup_variant
from lingua_franca.time import now_local, to_local
from enum import IntEnum
from functools import wraps

# Combining diaeresis, written above a letter as the Syriac plural mark (syameh).
_SYAMEH = '\u0308'


def nice_number_syr(number, speech=True, denominators=range(1, 21), variant=None):
    """Syriac helper for nice_number.

    Formats a float as a mixed fraction: 4.5 becomes "4 1/2" for display
    and the Syriac for "4 and a half" for speech.

    Args:
        number (int or float): the value to format
        speech (bool): format for speech (True) or display (False)
        denominators (iter of ints): denominators to use, default [1 .. 20]
        variant: accepted for interface compatibility; not used here
    Returns:
        (str): The formatted string.
    """
    result = convert_to_mixed_fraction(number, denominators)
    if not result:
        # Give up, just represent as a 3 decimal number
        return str(round(number, 3))

    whole, num, den = result

    # A value without a fractional part reads the same in both modes.
    # TODO: Number grouping?  E.g. "1,000,000"
    if num == 0:
        return str(whole)

    if not speech:
        return '{} {}/{}'.format(whole, num, den)

    is_half = num == 1 and den == 2
    if whole == 0:
        if is_half:
            return 'ܦܠܓܐ'
        return '{} ܡܢ {}'.format(_lookup_syriac_word(num),
                                 _lookup_syriac_word(den))

    # The whole part stays in digits; only the fraction is spelled out.
    if is_half:
        return '{} ܘܦܠܓܐ'.format(whole)
    return '{} ܘ{} ܡܢ {}'.format(whole, _lookup_syriac_word(num),
                                  _lookup_syriac_word(den))


def _unpack_number_to_parts(value, _precision):
    """
    Split a number into its whole part and its fractional digits.

    Args:
        value (int or float): the number to split
        _precision (int): maximum number of decimal places to keep

    Returns:
        (pre): The whole number
        (post): The fractional digits as an int, trailing zeros removed
        (_precision): The number of decimal places `post` represents
    """
    pre = int(value)

    post = abs(value - pre) * 10**_precision

    if abs(round(post) - post) < 0.01:
        # We generally floor all values beyond our precision (rather than
        # rounding), but in cases where we have something like 1.239999999,
        # which is probably due to python's handling of floats, we actually
        # want to consider it as 1.24 instead of 1.23
        post = int(round(post))
    else:
        post = int(math.floor(post))

    # Drop trailing zeros so 0.50 is treated as 5 tenths, not 50 hundredths.
    while post != 0:
        x, y = divmod(post, 10)
        if y != 0:
            break
        post = x
        _precision -= 1

    return pre, post, _precision


def _lookup_syriac_word(number, ordinals=False):
    """
    Spell a non-negative whole number in Syriac.

    Compound ordinals are built as "cardinal head + conjoiner + ordinal
    tail", the pattern the tens already used. The original compared with
    `number > 1000`, so exactly 1000 fell into the hundreds branch and
    raised IndexError; its >= 1000 branch indexed the word lists with
    out-of-range values; and the ordinal flag was dropped for 101..999.

    Args:
        number (int): the number to pronounce
        ordinals (bool): pronounce in ordinal form "first" instead of "one"

    Returns: Number string

    Raises:
        KeyError: an ordinal was requested for 0, or for a round multiple
            of 1000 that _SYRIAC_ORDINAL_BASE does not list.
    """
    if number < 20:
        if ordinals:
            return _SYRIAC_ORDINAL_BASE[number]
        return _SYRIAC_ONES[number]

    if number < 100:
        quotient, remainder = divmod(number, 10)
        if remainder == 0:
            if ordinals:
                return _SYRIAC_ORDINAL_BASE[number]
            return _SYRIAC_TENS[quotient]
        if ordinals:
            tail = _SYRIAC_ORDINAL_BASE[remainder]
        else:
            tail = _SYRIAC_ONES[remainder]
        return _SYRIAC_TENS[quotient] + _SYRIAC_CONJOINER + tail

    if number < 1000:
        quotient, remainder = divmod(number, 100)
        if remainder == 0:
            if ordinals:
                return _SYRIAC_ORDINAL_BASE[number]
            return _SYRIAC_HUNDREDS[quotient]
        return (_SYRIAC_HUNDREDS[quotient] + _SYRIAC_CONJOINER +
                _lookup_syriac_word(remainder, ordinals))

    # 1000 and above: the cardinal form is assembled per group of thousands.
    if not ordinals:
        return _generate_whole_numbers(number)

    remainder = number % 1000
    if remainder == 0:
        # Only the tabulated round ordinals exist; others raise KeyError.
        return _SYRIAC_ORDINAL_BASE[number]
    return (_generate_whole_numbers(number - remainder) + _SYRIAC_CONJOINER +
            _lookup_syriac_word(remainder, ordinals=True))


def _generate_whole_numbers(number, ordinals=False):
    """
    Spell a whole number by walking its groups of three digits.

    The groups are visited from least to most significant, one per entry
    of _SYRIAC_LARGE. 103254654 is processed as 654, then 254 followed by
    the word for thousand, then 103 followed by the word for million;
    each new group is put in front of the text built so far.

    Args:
        number (int): the number to pronounce
        ordinals (bool): pronounce in ordinal form "first" instead of "one"

    Returns:
        (result): The final number string; empty when no group is non-zero
    """
    temp_number = number
    result = ''

    for syriac_large_num in _SYRIAC_LARGE:
        temp_number, remainder = divmod(temp_number, 1000)
        if remainder == 0:
            continue

        if ordinals:
            # The lookup renders the complete ordinal from the full number.
            return _lookup_syriac_word(number, ordinals)

        if remainder == 1 and syriac_large_num == 'ܐܠܦܐ':
            # One thousand is the bare word for thousand.
            text = syriac_large_num
        else:
            text = _lookup_syriac_word(remainder)
            if syriac_large_num != '':
                text += ' ' + syriac_large_num

        if result:
            result = text + _SYRIAC_CONJOINER + result
        else:
            result = text
    return result


def _generate_fractional_numbers(number, _precision):
    """
    Spell the fractional part of a number.

    Args:
        number (int): the fractional digits, e.g. 25 for 0.25
        _precision (int): how many decimal places `number` represents

    Returns:
        (result): the digits as a whole number, the separator, then the
        denominator word. Exactly one half returns its own word.
    """
    if number / 10**_precision == 0.5:
        return "ܦܠܓܐ"

    whole = _generate_whole_numbers(number)
    quotient, remainder = divmod(_precision, 3)

    # String will either have part of the _SYRIAC_FRAC OR the _SYRIAC_FRAC_BIG list
    fractional = _SYRIAC_SEPARATOR + _SYRIAC_FRAC[remainder] + _SYRIAC_FRAC_BIG[quotient]

    return whole + fractional


def _generate_numbers_string(number, places, ordinals=False):
    """
    Spell a signed number, whole and fractional parts included.

    Args:
        number (int or float): the number to pronounce
        places (int): maximum decimal places to speak
        ordinals (bool): pronounce in ordinal form "first" instead of "one"

    Returns:
        (str): the spoken number
    """
    if number < 0:
        # Pass the ordinal flag through; it used to be dropped here.
        return "ܣܚܘܦܐ " + _generate_numbers_string(-number, places, ordinals)

    if number == 0:
        return "ܣܝܦܪ"

    whole, fractional, precision = _unpack_number_to_parts(number, places)

    if fractional == 0:
        if whole == 0:
            # Non-zero input smaller than the requested precision: say
            # zero rather than returning an empty string.
            return "ܣܝܦܪ"
        return _generate_whole_numbers(whole, ordinals)
    if whole == 0:
        return _generate_fractional_numbers(fractional, precision)

    return (_generate_whole_numbers(whole) + _SYRIAC_CONJOINER +
            _generate_fractional_numbers(fractional, precision))


def pronounce_number_syr(number, places=2, scientific=False,
                         ordinals=False, variant=None):
    """
    Convert a number to its spoken equivalent.

    Args:
        number (float or int): the number to pronounce
        places (int): maximum decimal places to speak
        scientific (bool): pronounce in scientific notation
        ordinals (bool): pronounce in ordinal form "first" instead of "one"
        variant: accepted for interface compatibility; not used here
    Returns:
        (str): The pronounced number
    """
    # deal with infinity
    if number == float("inf"):
        return "ܠܐ ܡܬܚܡܐ"
    if number == float("-inf"):
        return "ܣܚܘܦܐ ܠܐ ܡܬܚܡܐ"

    if scientific:
        if number == 0:
            return "ܣܝܦܪ"
        # Keep the formatted text in its own name. The original rebound
        # `number` to this string, so a zero exponent passed a str on to
        # the numeric code below and raised TypeError.
        formatted = '%E' % number
        mantissa, power = formatted.replace("+", "").split("E")
        power = int(power)

        if power != 0:
            return '{}{} ܥܦܝܦ ܥܣܪܐ ܒܚܝܠܐ ܕ{}{}'.format(
                'ܣܚܘܦܐ ' if float(mantissa) < 0 else '',
                pronounce_number_syr(
                    abs(float(mantissa)), places, False, ordinals=False),
                'ܣܚܘܦܐ ' if power < 0 else '',
                pronounce_number_syr(abs(power), places, False, ordinals=False))

    if ordinals:
        return _generate_numbers_string(number, places, ordinals=True)

    return _generate_numbers_string(number, places)


def nice_time_syr(dt, speech=True, use_24hour=False, use_ampm=False, variant=None):
    """
    Format a time to a comfortable human format.

    For example, generate 'five thirty' for speech or '5:30' for
    text display.

    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
        variant: accepted for interface compatibility; not used here
    Returns:
        (str): The formatted time string
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    else:
        if use_ampm:
            # e.g. "3:01 AM" or "2:22 PM"
            string = dt.strftime("%I:%M %p")
        else:
            # e.g. "3:01" or "2:22"
            string = dt.strftime("%I:%M")
        if string[0] == '0':
            string = string[1:]  # strip leading zeros

    if not speech:
        return string

    # Generate a speakable version of the time
    if use_24hour:
        # "HH:MM": int() drops the leading zero of each field.
        speak = pronounce_number_syr(int(string[0:2]))
        if string[3:5] != '00':
            speak += " ܘ" + pronounce_number_syr(int(string[3:5]))
            speak += ' ܩܛܝܢܬ̈ܐ'
        return speak

    # Midnight and noon have their own names, whatever use_ampm says.
    if dt.hour == 0 and dt.minute == 0:
        return "ܛܗܪ̈ܝ ܠܠܝܐ"
    if dt.hour == 12 and dt.minute == 0:
        return "ܛܗܪܐ"

    hour = dt.hour % 12 or 12  # 12 hour clock and 0 is spoken as 12
    if dt.minute == 15:
        speak = pronounce_number_syr(hour) + " ܘܪܘܒܥܐ"
    elif dt.minute == 30:
        speak = pronounce_number_syr(hour) + " ܘܦܠܓܐ"
    elif dt.minute == 45:
        # Spoken relative to the coming hour.
        next_hour = (dt.hour + 1) % 12 or 12
        speak = "ܪܘܒܥܐ ܩܐ " + pronounce_number_syr(next_hour)
    else:
        speak = pronounce_number_syr(hour)

        if dt.minute == 0:
            if not use_ampm:
                return speak
        else:
            speak += " ܘ" + pronounce_number_syr(dt.minute) + ' ܩܛܝܢܬ̈ܐ'

    if use_ampm:
        if dt.hour > 11:
            speak += " ܒܬܪ ܛܗܪܐ"
        else:
            speak += " ܩܕܡ ܛܗܪܐ"

    return speak


def nice_relative_time_syr(when, relative_to=None, lang=None):
    """Create a relative phrase to roughly describe a datetime.

    Examples are "25 seconds", "tomorrow", "7 days".

    Args:
        when (datetime): Local timezone
        relative_to (datetime): Baseline for relative time, default is now()
        lang (str, optional): accepted for interface compatibility; unused
    Returns:
        str: Relative description of the given time
    """
    # now_local/to_local come from lingua_franca.time; the module did not
    # import them, so this function used to raise NameError when called.
    now = relative_to if relative_to else now_local()
    total_seconds = (to_local(when) - now).total_seconds()

    # Anything less than one second ahead, past times included, is "now".
    if total_seconds < 1:
        return "ܗܫܐ"

    if total_seconds < 90:
        seconds = int(total_seconds)
        # Compare the displayed integer: an exact float == 1 test made
        # 1.5 seconds read as the plural form with the number 1.
        if seconds == 1:
            return "ܚܕ ܪܦܦܐ"
        return "{} ܪ̈ܦܦܐ".format(seconds)

    minutes = int((total_seconds + 30) // 60)  # +30 to round minutes
    if minutes < 90:
        if minutes == 1:
            return "ܚܕ ܩܛܝܢܬܐ"
        return "{} ܩܛܝܢܬ̈ܐ".format(minutes)

    hours = int((minutes + 30) // 60)  # +30 to round hours
    if hours < 36:
        if hours == 1:
            return "ܚܕ ܫܥܬܐ"
        return "{} ܫܥ̈ܐ".format(hours)

    # TODO: "2 weeks", "3 months", "4 years", etc
    days = int((hours + 12) // 24)  # +12 to round days
    if days == 1:
        return "ܚܕ ܝܘܡܐ"
    return "{} ܝܘܡܢ̈ܐ".format(days)


def _singularize_syr(word):
    """
    Strip every combining mark from the word.

    The word is decomposed (NFD) and all characters of category "Mn"
    (Nonspacing_Mark) are dropped, which removes the syameh along with
    any other combining mark. The result stays in decomposed form.
    """
    return ''.join(char for char in unicodedata.normalize('NFD', word)
                   if unicodedata.category(char) != 'Mn')


def _pluralize_syr(word):
    """
    Add the plural mark (syameh) to a word.

    The penultimate letter in the word usually receives the syameh (ܣܝܡ̈ܐ)
    unless there is a letter ܪ in the word; then the syameh is written
    above the ܪ, independent of its place. With two or more letters ܪ the
    syameh goes on the last one.

    Words that already carry the mark are returned unchanged, and so are
    words too short to have a penultimate letter. The original raised
    IndexError on those and stacked a second mark on plural input.
    """
    if _SYAMEH in word:
        return word

    # If the word has a ܪ, find its last occurrence and place the syameh
    # above it.
    if 'ܪ' in word:
        index = word.rindex('ܪ')
        return word[:index] + 'ܪ̈' + word[index + 1:]

    if len(word) < 2:
        return word

    return word[:-2] + word[-2] + _SYAMEH + word[-1:]


def get_plural_form_syr(word, amount):
    """
    Get plural form of the specified word for the specified amount.

    Args:
        word(str): Word to be pluralized.
        amount(int or float): An amount equal to 1 selects the singular
            form; any other amount selects the plural form.
    Returns:
        (str): Pluralized word.
    """
    if amount == 1:
        return _singularize_syr(word)
    return _pluralize_syr(word)

# Remainder of this flattened patch line (header of the next file), kept verbatim:
# diff --git a/lingua_franca/lang/parse_syr.py b/lingua_franca/lang/parse_syr.py new file mode 100644 index 00000000..cbdda47b --- /dev/null +++ b/lingua_franca/lang/parse_syr.py @@ -0,0 +1,539 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#
import json
from datetime import timedelta

from lingua_franca.internal import resolve_resource_file
from lingua_franca.lang.common_data_syr import (_SYRIAC_ORDINAL_BASE, _SYRIAC_LARGE,
                                                _SYRIAC_HUNDREDS, _SYRIAC_ONES,
                                                _SYRIAC_ONES_FEM, _SYRIAC_TENS,
                                                _SYRIAC_FRACTIONS, _SYRIAC_FRACTIONS_HALF,
                                                _SYRIAC_SEPARATOR,
                                                _SYRIAC_FEMALE_ENDINGS,
                                                _SYRIAC_MALE_ENDINGS,
                                                _SYRIAC_GENDERED_NOUNS_EXCEPTIONS)
from lingua_franca.lang.parse_common import Normalizer
from lingua_franca.time import now_local

# Ordinal word -> numeric value, the inverse of _SYRIAC_ORDINAL_BASE.
_ORDINAL_WORD_VALUES = {word: value
                        for value, word in _SYRIAC_ORDINAL_BASE.items()}


def _is_number(s):
    """Return True when `s` is accepted by float()."""
    try:
        float(s)
        return True
    except ValueError:
        return False


def _parse_sentence(text):
    """
    Split text into plain words and recognised numbers.

    Args:
        text (str): the sentence to scan

    Returns:
        (list): the input in order, where each run of number words is
        replaced by one tuple (value, [words that formed it]) and every
        other word stays a str. A leading conjoiner ܘ is removed from
        words before they are matched. Runs that sum to zero are dropped.
    """
    words = text.split()
    result = []
    current_number = 0
    current_words = []
    sum_number = 0
    mode = 'init'

    def finish_num():
        # Close the number being assembled and reset the state machine.
        nonlocal current_number
        nonlocal sum_number
        nonlocal result
        nonlocal mode
        nonlocal current_words
        current_number += sum_number
        if current_number != 0:
            result.append((current_number, current_words))
        sum_number = 0
        current_number = 0
        current_words = []
        mode = 'init'

    for word in words:

        # Keep a copy of the word as we will modify it below
        temp_word = word

        # If the first letter starts with ܘ then treat it specifically as a conjoining ܘ as in this
        # context it is a conjoining letter and there is most likely a number following it
        if word[0] == "ܘ":
            word = word[1:]  # Remove the ܘ to make the logic easier to follow

            if mode == 'num_ten' or mode == 'num_hundred' or mode == 'num_one':
                mode += '_conjoiner'
            elif mode == 'num':
                pass
            else:
                finish_num()

        if word == "ܦܠܓܐ":
            current_words.append(temp_word)
            current_number += 0.5
            finish_num()
        elif word in _SYRIAC_ONES or word in _SYRIAC_ONES_FEM:
            if word in _SYRIAC_ONES:
                temp_ones_number = _SYRIAC_ONES.index(word)
            elif word in _SYRIAC_ONES_FEM:
                temp_ones_number = _SYRIAC_ONES_FEM.index(word)

            if mode != 'init' and mode != 'num_hundred_conjoiner' and mode != 'num':
                if not(temp_ones_number < 10 and mode == 'num_ten_conjoiner'):
                    finish_num()
            current_words.append(temp_word)
            sum_number += temp_ones_number
            mode = 'num_one'
        elif word in _SYRIAC_TENS:
            if mode != 'init' and mode != 'num_hundred_conjoiner' and mode != 'num':
                finish_num()
            current_words.append(temp_word)
            sum_number += _SYRIAC_TENS.index(word)*10
            mode = 'num_ten'
        elif word in _SYRIAC_HUNDREDS:
            if mode != 'init' and mode != 'num':
                finish_num()
            current_words.append(temp_word)
            sum_number += _SYRIAC_HUNDREDS.index(word)*100
            mode = 'num_hundred'
        elif word in _SYRIAC_LARGE:
            current_words.append(temp_word)
            temp_large_number = _SYRIAC_LARGE.index(word)
            # A bare "thousand" counts as one thousand.
            if mode == 'init' and temp_large_number == 1:
                sum_number = 1
            sum_number *= 10**(3*temp_large_number)
            current_number += sum_number
            sum_number = 0
            mode = 'num'
        elif word in _ORDINAL_WORD_VALUES:
            current_words.append(temp_word)
            # Use the ordinal's value. The original took the word's
            # position in the table plus one, which is only right up to
            # the 20th: the entry for 30 sits at position 20 and came out
            # as 21.
            current_number = _ORDINAL_WORD_VALUES[word]
            sum_number = 0
            mode = 'num'
        elif _is_number(word):
            current_words.append(word)
            current_number = float(word)
            finish_num()
        else:
            fraction = is_fractional_syr(word)
            if fraction:
                # Handled like the word for one half above. The original
                # added the float to the result list and raised TypeError.
                current_words.append(temp_word)
                current_number += fraction
                finish_num()
            else:
                finish_num()
                result.append(word)

    if mode[:3] == 'num':
        finish_num()

    return result


# Every spelling of a clock unit, singular and plural, mapped to one unit.
_time_units = {
    'ܪ̈ܦܦܐ': timedelta(seconds=1),
    'ܪܦܦܐ': timedelta(seconds=1),
    'ܩܛܝܢܬ̈ܐ': timedelta(minutes=1),
    'ܩܛܝܢܬܐ': timedelta(minutes=1),
    'ܩܛܝܢ̈ܐ': timedelta(minutes=1),
    'ܩܛܝܢܐ': timedelta(minutes=1),
    'ܕܩܝܩܬ̈ܐ': timedelta(minutes=1),
    'ܕܩܝܩܬܐ': timedelta(minutes=1),
    'ܕܩܝܩ̈ܐ': timedelta(minutes=1),
    'ܕܩܝܩܐ': timedelta(minutes=1),
    'ܫܥܬܐ': timedelta(hours=1),
    'ܫܥ̈ܐ': timedelta(hours=1),
    'ܣܥܬ': timedelta(hours=1),
    'ܣܥܬ̈ܐ': timedelta(hours=1),
}

# Calendar units. ܫܒܬܐ is also listed as a weekday name in
# extract_datetime_syr, which checks weekday names first.
_date_units = {
    'ܝܘܡܢ̈ܐ': timedelta(days=1),
    'ܝܘܡܐ': timedelta(days=1),
    'ܫܒܘܥ̈ܐ': timedelta(weeks=1),
    'ܫܒܘܥܐ': timedelta(weeks=1),
    'ܫܒܬ̈ܐ': timedelta(weeks=1),
    'ܫܒܬܐ': timedelta(weeks=1),
}


def extract_duration_syr(text):
    """
    Convert a Syriac phrase into a duration.

    Handles the Syriac equivalents of:
        "10 minute"
        "2 and a half hours"
        "3 days 8 hours 10 minutes and 49 seconds"

    The words used in the duration are consumed and the rest is returned.

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str):
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The duration is
                    timedelta(0) when none is found.
    """
    remainder = []
    current_number = None
    result = timedelta(0)
    for word in _parse_sentence(text):
        if isinstance(word, tuple):
            # A number followed by another number was never used: give
            # its words back instead of losing them.
            if current_number:
                remainder.extend(current_number[1])
            current_number = word
        elif word in _time_units or word in _date_units:
            unit = _time_units[word] if word in _time_units else _date_units[word]
            # A unit without a number counts once, as extract_datetime_syr
            # does; the original subscripted None here and raised TypeError.
            amount = current_number[0] if current_number else 1
            result += unit * amount
            current_number = None
        else:
            if current_number:
                remainder.extend(current_number[1])
            remainder.append(word)
            current_number = None

    # A trailing number with no unit belongs to the unconsumed text.
    if current_number:
        remainder.extend(current_number[1])
    return (result, " ".join(remainder))


def extract_datetime_syr(text, anchorDate=None, default_time=None):
    """ Convert a human date reference into an exact datetime

    Handles the Syriac equivalents of "today", "tomorrow afternoon" or a
    weekday name. If a reference date is not provided, the current local
    time is used. Words that define the date are consumed; the remaining
    words are returned.

    NOTE(review): the nesting of this function was reconstructed from a
    whitespace-flattened patch and its logic is reproduced as read, without
    behavioural changes. Confirm against the original file.

    Args:
        text (str): string containing date words
        anchorDate (datetime): A reference date/time for "tomorrow", etc
        default_time (time): accepted but not used by this implementation

    Returns:
        [datetime, str]: the datetime and the remaining text not consumed
                         in the parsing, or None when `text` is empty or a
                         before/after word follows a completed date.
    """
    if text == "":
        return None

    # Strip punctuation, then glue multi-word expressions together so each
    # survives whitespace splitting as one token.
    # NOTE(review): the first pattern is an invisible character in the
    # source; it is assumed to be U+200C (zero width non-joiner).
    replacements = (
        ('\u200c', ' '),
        ('.', ''),
        ('،', ''),
        ('؟', ''),
        ("ܝܘܡܐ ܐܚܪܢܐ", "ܝܘܡܐܐܚܪܢܐ"),
        ("ܩܘܕܡܐ ܕܥܪܝܪܗ", "ܩܘܕܡܐܕܥܪܝܪܗ"),
        ("ܝܘܡܐ ܕܐܬܐ", "ܝܘܡܐܕܐܬܐ"),
        ("ܩܘܕܡܐ ܕܐܬܐ", "ܩܘܕܡܐܕܐܬܐ"),
        ("ܩܕܡ ܛܗܪܐ", "ܩܕܡܛܗܪܐ"),
        ("ܒܬܪ ܛܗܪܐ", "ܒܬܪܛܗܪܐ"),
        ("ܒܬܪ ܟܘܬܪܐ", "ܒܬܪܟܘܬܪܐ"),
        ('ܬܪܝܢ ܒܫܒܐ', 'ܬܪܝܢܒܫܒܐ'),
        ('ܬܠܬܐ ܒܫܒܐ', 'ܬܠܬܒܫܒܐ'),
        ('ܐܪܒܥܐ ܒܫܒܐ', 'ܐܪܒܥܒܫܒܐ'),
        ('ܚܡܫܐ ܒܫܒܐ', 'ܚܡܫܒܫܒܐ'),
        ('ܚܕ ܒܫܒܐ', 'ܚܕܒܫܒܐ'),
    )
    text = text.lower()
    for old, new in replacements:
        text = text.replace(old, new)

    if not anchorDate:
        anchorDate = now_local()

    today = anchorDate.replace(hour=0, minute=0, second=0, microsecond=0)
    # %w numbers the weekdays 0 (Sunday) .. 6 (Saturday).
    today_weekday = int(anchorDate.strftime("%w"))
    # Position + 1 is compared with the %w value below.
    weekday_names = [
        'ܬܪܝܢܒܫܒܐ',
        'ܬܠܬܒܫܒܐ',
        'ܐܪܒܥܒܫܒܐ',
        'ܚܡܫܒܫܒܐ',
        'ܥܪܘܒܬܐ',
        'ܫܒܬܐ',
        'ܚܕܒܫܒܐ',
    ]
    # The source listed the key ܬܡܠ three times (-2, -1, -1); a dict
    # literal keeps the last value, so one entry at -1 is equivalent.
    daysDict = {
        'ܬܡܠ': today + timedelta(days= -1),
        'ܩܘܕܡܐܕܥܪܝܪܗ': today + timedelta(days= -2),
        'ܐܕܝܘܡ': today,
        'ܝܘܡܐܕܐܬܐ': today + timedelta(days= 1),
        'ܩܘܕܡܐܕܐܬܐ': today + timedelta(days= 1),
        'ܝܘܡܐܐܚܪܢܐ': today + timedelta(days= 2),
    }
    timesDict = {
        'ܩܕܡܛܗܪܐ': timedelta(hours=8),
        'ܩܕܡܬܐ': timedelta(hours=8),
        'ܒܬܪܛܗܪܐ': timedelta(hours=15),
        'ܒܬܪܟܘܬܪܐ': timedelta(hours=15),
    }

    exactDict = {
        'ܗܫܐ': anchorDate,
    }
    # NOTE(review): entries that contain a space cannot match, because the
    # text is compared word by word after whitespace splitting.
    nextWords = ["ܒܬܪ", "ܡܢ ܒܬܪ", "ܒܬܪ ܗܕܐ", "ܒܬܪܝܐ"]
    prevWords = ["ܩܕܝܡܐܝܬ", "ܡܩܕܡ ܕ", "ܩܕܡ", "ܡܢ ܩܕܡ", "ܩܘܕܡܐܝܬ", "ܩܕܡ ܐܕܝܐ"]
    words = _parse_sentence(text)
    mode = 'none'
    number_seen = None
    delta_seen = timedelta(0)
    remainder = []
    result = None
    for word in words:
        handled = 1

        # NOTE(review): the source checks for 'finished' here and does
        # nothing, so later words are still processed after a result is
        # complete.
        if mode == 'finished':
            pass

        if type(word) == tuple:
            number_seen = word
        elif word in weekday_names:
            dayOffset = (weekday_names.index(word) + 1) - today_weekday
            if dayOffset < 0:
                dayOffset += 7
            result = today + timedelta(days=dayOffset)
            mode = 'time'
        elif word in exactDict:
            result = exactDict[word]
            mode = 'finished'
        elif word in daysDict:
            result = daysDict[word]
            mode = 'time'
        elif word in timesDict and mode == 'time':
            result += timesDict[word]
            mode = 'finished'
        elif word in _date_units:
            k = 1
            if number_seen:
                k = number_seen[0]
                number_seen = None
            delta_seen += _date_units[word] * k
            if mode != 'delta_time':
                mode = 'delta_date'
        elif word in _time_units:
            k = 1
            if number_seen:
                k = number_seen[0]
                number_seen = None
            delta_seen += _time_units[word] * k
            mode = 'delta_time'
        elif word in nextWords or word in prevWords:
            # Give up instead of incorrect result
            if mode == 'time':
                return None
            # NOTE(review): `sign` is computed but never applied, so a
            # "before" word adds the offset exactly like an "after" word.
            sign = 1 if word in nextWords else -1
            if mode == 'delta_date':
                result = today + delta_seen
                mode = 'time'
            elif mode == 'delta_time':
                result = anchorDate + delta_seen
                mode = 'finished'
            else:
                handled = 0
        else:
            handled = 0

        # Turn an accumulated offset into a result as soon as it exists.
        if mode == 'delta_date':
            result = today + delta_seen
            mode = 'delta_time'
        elif mode == 'delta_time':
            result = anchorDate + delta_seen
            mode = 'finished'

        if handled == 1:
            continue
        if number_seen:
            remainder.extend(number_seen[1])
            number_seen = None
        # NOTE(review): read as a loop-level statement: any unhandled word
        # makes the result default to anchorDate. Confirm the nesting.
        if result == None:
            result = anchorDate

        remainder.append(word)

    return (result, " ".join(remainder))


def _syriac_word_value(word):
    """Return the number a single Syriac number word stands for, or None."""
    if not word:
        # The word tables hold "" at index 0; never match an empty string.
        return None
    if word in _SYRIAC_ONES:
        return _SYRIAC_ONES.index(word)
    if word in _SYRIAC_ONES_FEM:
        return _SYRIAC_ONES_FEM.index(word)
    if word in _SYRIAC_TENS:
        return _SYRIAC_TENS.index(word) * 10
    if word in _SYRIAC_HUNDREDS:
        return _SYRIAC_HUNDREDS.index(word) * 100
    if word in _SYRIAC_LARGE:
        # The original only assigned a base value for "thousand" and hit
        # an unbound local for every larger word.
        return 10 ** (3 * _SYRIAC_LARGE.index(word))
    return None


def is_fractional_syr(text):
    """
    Check whether the given text is a fraction.

    Three forms are recognised: a word for one half, a fraction word from
    _SYRIAC_FRACTIONS, and "numerator ܡܢ denominator" where both sides are
    single number words.

    Args:
        text (str): the string to check if fractional
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction
    """
    # Exception for half or ܦܠܓܐ
    if text in _SYRIAC_FRACTIONS_HALF:
        return 0.5

    # A fraction word stands for one over its key.
    for denominator, word in _SYRIAC_FRACTIONS.items():
        if word == text:
            return 1 / denominator

    # Otherwise it must be in the form "numerator ܡܢ denominator".
    numerator_text, separator, denominator_text = text.partition(_SYRIAC_SEPARATOR)
    if not separator:
        return False

    # Trim surrounding whitespace. The original called str.replace and
    # discarded its result, so nothing was removed.
    numerator = _syriac_word_value(numerator_text.strip())
    denominator = _syriac_word_value(denominator_text.strip())
    if numerator is None or not denominator:
        return False

    return numerator / denominator


def get_gender_syr(word, context=""):
    """ Guess the gender of a word

    The word is looked up in the exception table first, then matched
    against the feminine and masculine endings from common_data_syr. The
    original tested Latin letters ("a", "o", "e") and so never matched a
    Syriac word. If the word's own form gives no answer, the word before
    it in `context` is classified the same way and its gender is used.
    NOTE(review): that fallback is kept from the original; confirm that a
    preceding word is a meaningful hint in Syriac.

    Args:
        word (str): The word to look up
        context (str, optional): String containing word, for context

    Returns:
        "m" (male) or "f" (female), or False when the gender is unknown
        (False is kept for compatibility with existing callers).
    """
    if not word:
        return False

    if word in _SYRIAC_GENDERED_NOUNS_EXCEPTIONS:
        return _SYRIAC_GENDERED_NOUNS_EXCEPTIONS[word]
    # The feminine ending contains the masculine one, so test it first.
    if word.endswith(tuple(_SYRIAC_FEMALE_ENDINGS)):
        return "f"
    if word.endswith(tuple(_SYRIAC_MALE_ENDINGS)):
        return "m"

    words = context.split(" ")
    for idx, w in enumerate(words):
        if w == word and idx != 0:
            return get_gender_syr(words[idx - 1])
    return False


def extract_numbers_syr(text, short_scale=True, ordinals=False):
    """
    Takes in a string and extracts a list of numbers.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): accepted for interface compatibility; unused
        ordinals (bool): accepted for interface compatibility; unused
    Returns:
        list: list of extracted numbers
    """
    return [word[0] for word in _parse_sentence(text)
            if isinstance(word, tuple)]


def extract_number_syr(text, ordinals=False):
    """
    Extract the first number from a text string.

    Args:
        text (str): the string to search
        ordinals (bool): passed on to extract_numbers_syr
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found
    """
    numbers = extract_numbers_syr(text, ordinals=ordinals)
    if not numbers:
        return False
    return numbers[0]

# Remainder of this flattened patch line (and.word and the start of date_time.json), kept verbatim:
# diff --git a/lingua_franca/res/text/syr-sy/and.word b/lingua_franca/res/text/syr-sy/and.word new file mode 100644 index 00000000..d2836d57 --- /dev/null +++ b/lingua_franca/res/text/syr-sy/and.word @@ -0,0 +1 @@ +ܘ \ No newline at end of file diff --git a/lingua_franca/res/text/syr-sy/date_time.json b/lingua_franca/res/text/syr-sy/date_time.json new file mode 100644 index 00000000..19ec82c1 --- /dev/null +++ b/lingua_franca/res/text/syr-sy/date_time.json @@ -0,0 +1,180 @@ +{ + "decade_format": { + "1": { + "match": "^\\d$", + "format": "{x}" + }, + "2": { + "match": "^1\\d$", + "format": "{xx}" + }, + "3": { + "match": "^\\d0$", + "format": "{x0}" + }, + "4": { + "match": "^[2-9]\\d$", + "format": "{x0} {x}" + }, + "default": "{number}" + }, + "hundreds_format": { + "1": { + "match": "^\\d{3}$", + "format": "{x_in_x00} hundred" + }, + "default": "{number}" + }, + "thousand_format": { + "1": { + "match": "^\\d00\\d$", + "format": "{x_in_x000} thousand" + }, + "2": { + "match": "^1\\d00$", + "format": "{xx_in_xx00} hundred" + }, + "3": { + "match": "^\\d{2}00$", + "format": "{x0_in_x000} {x_in_x00} hundred" + }, + "4": { + "match": 
"^(1\\d{3})|(\\d0\\d{2})$", + "format": "{xx_in_xx00}" + }, + "5": { + "match": "^\\d{4}$", + "format": "{x0_in_x000} {x_in_x00}" + }, + "default": "{number}" + }, + "year_format": { + "1": { + "match": "^\\d\\d?$", + "format": "{formatted_decade} {bc}" + }, + "2": { + "match": "^\\d00$", + "format": "{formatted_hundreds} {bc}" + }, + "3": { + "match": "^\\d{3}$", + "format": "{formatted_hundreds} {formatted_decade} {bc}" + }, + "4": { + "match": "^\\d{2}00$", + "format": "{formatted_thousand} {bc}" + }, + "5": { + "match": "^\\d00\\d$", + "format": "{formatted_thousand} {formatted_decade} {bc}" + }, + "6": { + "match": "^\\d{2}0\\d$", + "format": "{formatted_thousand} {formatted_decade} {bc}" + }, + "7": { + "match": "^\\d{4}$", + "format": "{formatted_thousand} {formatted_decade} {bc}" + }, + "default": "{year} {bc}", + "bc": "ܩܕܡ ܡܫܝܚܐ" + }, + "date_format": { + "date_full": "{weekday}، {day} {month}، {formatted_year}", + "date_full_no_year": "{weekday}، {day} {month}", + "date_full_no_year_month": "{weekday}، {day}", + "today": "ܐܕܝܘܡ", + "tomorrow": "ܝܘܡܐ ܕܐܬܐ", + "yesterday": "ܬܡܠ" + }, + "date_time_format": { + "date_time": "{formatted_date} {formatted_time}" + }, + "weekday": { + "0": "ܬܪܝܢܒܫܒܐ", + "1": "ܬܠܬܒܫܒܐ", + "2": "ܐܪܒܥܒܫܒܐ", + "3": "ܚܡܫܒܫܒܐ", + "4": "ܥܪܘܒܬܐ", + "5": "ܫܒܬܐ", + "6": "ܚܕܒܫܒܐ" + }, + "date": { + "1": "ܩܕܡܝܐ", + "2": "ܬܪܝܢܐ", + "3": "ܬܠܝܬܝܐ", + "4": "ܪܒܝܥܝܐ", + "5": "ܚܡܝܫܝܐ", + "6": "ܫܬܝܬܝܐ", + "7": "ܫܒܝܥܝܐ", + "8": "ܬܡܝܢܝܐ", + "9": "ܬܫܝܥܝܐ", + "10": "ܥܣܝܪܝܐ", + "11": "ܚܕܥܣܝܪܝܐ", + "12": "ܬܪܥܣܝܪܝܐ", + "13": "ܬܠܬܥܣܝܪܝܐ", + "14": "ܐܪܒܥܣܝܪܝܐ", + "15": "ܚܡܫܥܣܝܪܝܐ", + "16": "ܫܬܥܣܝܪܝܐ", + "17": "ܫܒܥܣܝܪܝܐ", + "18": "ܬܡܢܥܣܝܪܝܐ", + "19": "ܬܫܥܣܝܪܝܐ", + "20": "ܥܣܪܝܢܝܐ", + "21": "ܥܣܪܝܢ ܘܩܕܡܝܐ", + "22": "ܥܣܪܝܢ ܘܬܪܝܢܐ", + "23": "ܥܣܪܝܢ ܘܬܠܝܬܝܐ", + "24": "ܥܣܪܝܢ ܘܪܒܝܥܝܐ", + "25": "ܥܣܪܝܢ ܘܚܡܝܫܝܐ", + "26": "ܥܣܪܝܢ ܘܫܬܝܬܝܐ", + "27": "ܥܣܪܝܢ ܘܫܒܝܥܝܐ", + "28": "ܥܣܪܝܢ ܘܬܡܝܢܝܐ", + "29": "ܥܣܪܝܢ ܘܬܫܝܥܝܐ", + "30": "ܬܠܬܝܢܝܐ", + "31": "ܬܠܬܝܢ ܘܩܕܡܝܐ" + }, + "month": { 
+ "1": "ܟܢܘܢ ܐܚܪܝܐ", + "2": "ܫܒܛ", + "3": "ܐܕܪ", + "4": "ܢܝܣܢ", + "5": "ܐܝܪ", + "6": "ܚܙܝܪܢ", + "7": "ܬܡܘܙ", + "8": "ܐܒ", + "9": "ܐܝܠܘܠ", + "10": "ܬܫܪܝܢ ܩܕܡܝܐ", + "11": "ܬܫܪܝܢ ܐܚܪܝܐ", + "12": "ܟܢܘܢ ܩܕܡܝܐ" + }, + "number": { + "0": "ܣܝܦܪ", + "1": "ܚܕ", + "2": "ܬܪܝܢ", + "3": "ܬܠܬܐ", + "4": "ܐܪܒܥܐ", + "5": "ܚܡܫܐ", + "6": "ܫܬܐ", + "7": "ܫܒܥܐ", + "8": "ܬܡܢܝܐ", + "9": "ܬܫܥܐ", + "10": "ܥܣܪܐ", + "11": "ܚܕܥܣܪ", + "12": "ܬܪܥܣܪ", + "13": "ܬܠܬܥܣܪ", + "14": "ܐܪܒܥܣܪ", + "15": "ܚܡܫܥܣܪ", + "16": "ܫܬܥܣܪ", + "17": "ܫܒܥܣܪ", + "18": "ܬܡܢܥܣܪ", + "19": "ܬܫܥܣܪ", + "20": "ܥܣܪܝܢ", + "30": "ܬܠܬܝܢ", + "40": "ܐܪܒܥܝܢ", + "50": "ܚܡܫܝܢ", + "60": "ܫܬܝܢ", + "70": "ܫܒܥܝܢ", + "80": "ܬܡܢܝܢ", + "90": "ܬܫܥܝܢ" + } +} diff --git a/lingua_franca/res/text/syr-sy/date_time_test.json b/lingua_franca/res/text/syr-sy/date_time_test.json new file mode 100644 index 00000000..a1561805 --- /dev/null +++ b/lingua_franca/res/text/syr-sy/date_time_test.json @@ -0,0 +1,36 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ܚܕ ܩܕܡ ܡܫܝܚܐ" }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ܥܣܪܐ ܩܕܡ ܡܫܝܚܐ" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ܐܠܦܐ ܘܬܪܥܣܪ" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ܐܠܦܐ ܘܐܪܒܥܝܢ ܘܫܬܐ" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ܐܠܦܐ ܘܬܡܢܡܐܐ ܘܫܒܥܐ" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ܐܠܦܐ ܘܫܒܥܡܐܐ ܘܫܒܥܣܪ" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ܐܠܦܐ ܘܬܫܥܡܐܐ ܘܬܡܢܝܢ ܘܬܡܢܝܐ"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ܬܪܝܢ ܐܠܦ̈ܐ ܘܬܫܥܐ"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ܬܪܝܢ ܐܠܦ̈ܐ ܘܬܡܢܥܣܪ"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ܬܪܝܢ ܐܠܦ̈ܐ ܘܥܣܪܝܢ
ܘܚܕ"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ܬܪܝܢ ܐܠܦ̈ܐ ܘܬܠܬܝܢ"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ܬܪܝܢ ܐܠܦ̈ܐ ܘܡܐܐ" }, + "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ܐܠܦܐ" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "ܬܪܝܢ ܐܠܦ̈ܐ" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ܬܠܬܐ ܐܠܦ̈ܐ ܘܡܐܐ ܘܥܣܪܝܢ ܩܕܡ ܡܫܝܚܐ" }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ܬܠܬܐ ܐܠܦ̈ܐ ܘܬܪܝܢܡܐܐ ܘܐܪܒܥܝܢ ܘܚܕ ܩܕܡ ܡܫܝܚܐ" }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ܚܡܫܐ ܐܠܦ̈ܐ ܘܬܪܝܢܡܐܐ" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "ܬܠܬܒܫܒܐ، ܬܠܬܝܢ ܘܩܕܡܝܐ ܟܢܘܢ ܐܚܪܝܐ، ܥܣܪܝܢ ܫܒܥܣܪ"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "ܚܕܒܫܒܐ، ܪܒܝܥܝܐ ܫܒܛ، ܥܣܪܝܢ ܬܡܢܥܣܪ"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "ܚܕܒܫܒܐ، ܪܒܝܥܝܐ ܫܒܛ"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "ܚܕܒܫܒܐ، ܪܒܝܥܝܐ"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "ܝܘܡܐ ܕܐܬܐ"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "ܐܕܝܘܡ"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "ܬܡܠ"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "ܚܕܒܫܒܐ، ܪܒܝܥܝܐ ܫܒܛ"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "ܚܕܒܫܒܐ، ܪܒܝܥܝܐ ܫܒܛ، ܥܣܪܝܢ ܬܡܢܥܣܪ"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": 
"ܬܠܬܒܫܒܐ، ܬܠܬܝܢ ܘܩܕܡܝܐ ܟܢܘܢ ܐܚܪܝܐ، ܥܣܪܝܢ ܫܒܥܣܪ ܚܕ ܘܥܣܪܝܢ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ ܒܬܪ ܛܗܪܐ"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "ܬܠܬܒܫܒܐ، ܬܠܬܝܢ ܘܩܕܡܝܐ ܟܢܘܢ ܐܚܪܝܐ، ܥܣܪܝܢ ܫܒܥܣܪ ܬܠܬܥܣܪ ܘܥܣܪܝܢ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ"} + } +} diff --git a/lingua_franca/res/text/syr-sy/day.word b/lingua_franca/res/text/syr-sy/day.word new file mode 100644 index 00000000..9f01075f --- /dev/null +++ b/lingua_franca/res/text/syr-sy/day.word @@ -0,0 +1 @@ +ܝܘܡܐ \ No newline at end of file diff --git a/lingua_franca/res/text/syr-sy/days.word b/lingua_franca/res/text/syr-sy/days.word new file mode 100644 index 00000000..d6f75f89 --- /dev/null +++ b/lingua_franca/res/text/syr-sy/days.word @@ -0,0 +1 @@ +ܝܘܡܢ̈ܐ \ No newline at end of file diff --git a/lingua_franca/res/text/syr-sy/hour.word b/lingua_franca/res/text/syr-sy/hour.word new file mode 100644 index 00000000..b0d9f13a --- /dev/null +++ b/lingua_franca/res/text/syr-sy/hour.word @@ -0,0 +1,3 @@ +ܫܥܬܐ +ܫܥܐ +ܣܥܬ \ No newline at end of file diff --git a/lingua_franca/res/text/syr-sy/hours.word b/lingua_franca/res/text/syr-sy/hours.word new file mode 100644 index 00000000..848c7123 --- /dev/null +++ b/lingua_franca/res/text/syr-sy/hours.word @@ -0,0 +1,3 @@ +ܫܥ̈ܐ +ܫܥܬ̈ܐ +ܣܥܬ̈ܐ \ No newline at end of file diff --git a/lingua_franca/res/text/syr-sy/minute.word b/lingua_franca/res/text/syr-sy/minute.word new file mode 100644 index 00000000..65498f1b --- /dev/null +++ b/lingua_franca/res/text/syr-sy/minute.word @@ -0,0 +1,4 @@ +ܩܛܝܢܐ +ܩܛܝܢܬܐ +ܕܩܝܩܬܐ +ܕܩܝܩܐ \ No newline at end of file diff --git a/lingua_franca/res/text/syr-sy/minutes.word b/lingua_franca/res/text/syr-sy/minutes.word new file mode 100644 index 00000000..f8884c06 --- /dev/null +++ b/lingua_franca/res/text/syr-sy/minutes.word @@ -0,0 +1,4 @@ +ܩܛܝܢܬ̈ܐ +ܩܛܝܢ̈ܐ +ܕܩܝܩܬ̈ܐ +ܕܩܝܩ̈ܐ \ No newline at end of file diff --git a/lingua_franca/res/text/syr-sy/or.word b/lingua_franca/res/text/syr-sy/or.word new file mode 100644 
index 00000000..5e22fb72 --- /dev/null +++ b/lingua_franca/res/text/syr-sy/or.word @@ -0,0 +1 @@ +ܐܘ \ No newline at end of file diff --git a/lingua_franca/res/text/syr-sy/second.word b/lingua_franca/res/text/syr-sy/second.word new file mode 100644 index 00000000..9e92468b --- /dev/null +++ b/lingua_franca/res/text/syr-sy/second.word @@ -0,0 +1 @@ +ܪܦܦܐ \ No newline at end of file diff --git a/lingua_franca/res/text/syr-sy/seconds.word b/lingua_franca/res/text/syr-sy/seconds.word new file mode 100644 index 00000000..ba36073d --- /dev/null +++ b/lingua_franca/res/text/syr-sy/seconds.word @@ -0,0 +1 @@ +ܪ̈ܦܦܐ \ No newline at end of file diff --git a/test/test_format_syr.py b/test/test_format_syr.py new file mode 100644 index 00000000..3c7d21b2 --- /dev/null +++ b/test/test_format_syr.py @@ -0,0 +1,364 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import json +import unittest +import datetime +import ast +import warnings +import sys +from pathlib import Path + +# TODO either write a getter for lingua_franca.internal._SUPPORTED_LANGUAGES, +# or make it public somehow +from lingua_franca import load_languages, unload_languages, set_default_lang, \ + get_primary_lang_code, get_active_langs, get_supported_langs +from lingua_franca.internal import UnsupportedLanguageError +from lingua_franca.format import nice_number +from lingua_franca.format import nice_time +from lingua_franca.format import nice_date +from lingua_franca.format import nice_date_time +from lingua_franca.format import nice_year +from lingua_franca.format import pronounce_number +from lingua_franca.format import date_time_format +from lingua_franca.format import join_list +from lingua_franca.lang.format_syr import get_plural_form_syr +from lingua_franca.time import default_timezone + + +def setUpModule(): + load_languages(get_supported_langs()) + set_default_lang('syr-sy') + +def tearDownModule(): + unload_languages(get_active_langs()) + + +NUMBERS_FIXTURE_EN = { + 1.435634: '1.436', + 2: '2', + 5.0: '5', + 0.027: '0.027', + 0.25: 'ܚܕ ܡܢ ܐܪܒܥܐ', + 0.3: 'ܬܠܬܐ ܡܢ ܥܣܪܐ', + 0.5: 'ܦܠܓܐ', + 0.75: 'ܬܠܬܐ ܡܢ ܐܪܒܥܐ', + 1.333: '1 ܘܚܕ ܡܢ ܬܠܬܐ', + 2.666: '2 ܘܬܪܝܢ ܡܢ ܬܠܬܐ', + 1.25: '1 ܘܚܕ ܡܢ ܐܪܒܥܐ', + 1.75: '1 ܘܬܠܬܐ ܡܢ ܐܪܒܥܐ', + 3.4: '3 ܘܬܪܝܢ ܡܢ ܚܡܫܐ', + 16.8333: '16 ܘܚܡܫܐ ܡܢ ܫܬܐ', + 12.5714: '12 ܘܐܪܒܥܐ ܡܢ ܫܒܥܐ', + 9.625: '9 ܘܚܡܫܐ ܡܢ ܬܡܢܝܐ', + 6.777: '6 ܘܫܒܥܐ ܡܢ ܬܫܥܐ', + 3.1: '3 ܘܚܕ ܡܢ ܥܣܪܐ', + 2.272: '2 ܘܬܠܬܐ ܡܢ ܚܕܥܣܪ', + 5.583: '5 ܘܫܒܥܐ ܡܢ ܬܪܥܣܪ', + 8.384: '8 ܘܚܡܫܐ ܡܢ ܬܠܬܥܣܪ', + 0.071: 'ܚܕ ܡܢ ܐܪܒܥܣܪ', + 6.466: '6 ܘܫܒܥܐ ܡܢ ܚܡܫܥܣܪ', + 8.312: '8 ܘܚܡܫܐ ܡܢ ܫܬܥܣܪ', + 2.176: '2 ܘܬܠܬܐ ܡܢ ܫܒܥܣܪ', + 200.722: '200 ܘܬܠܬܥܣܪ ܡܢ ܬܡܢܥܣܪ', + 7.421: '7 ܘܬܡܢܝܐ ܡܢ ܬܫܥܣܪ', + 0.05: 'ܚܕ ܡܢ ܥܣܪܝܢ' +} + + +class TestNiceNumberFormat(unittest.TestCase): + + tmp_var = None + + def set_tmp_var(self, val): + self.tmp_var = val + + def test_convert_float_to_nice_number(self): + for 
number, number_str in NUMBERS_FIXTURE_EN.items(): + self.assertEqual(nice_number(number), number_str, + 'should format {} as {} and not {}'.format( + number, number_str, nice_number(number))) + + def test_specify_denominator(self): + self.assertEqual(nice_number(5.5, denominators=[1, 2, 3]), + '5 ܘܦܠܓܐ', + 'should format 5.5 as 5 and a half not {}'.format( + nice_number(5.5, denominators=[1, 2, 3]))) + self.assertEqual(nice_number(2.333, denominators=[1, 2]), + '2.333', + 'should format 2.333 as 2.333 not {}'.format( + nice_number(2.333, denominators=[1, 2]))) + + def test_no_speech(self): + self.assertEqual(nice_number(12.421, speech=False), + '12 8/19', + 'should format 12.421 as 12 8/19 not {}'.format( + nice_number(12.421, speech=False))) + self.assertEqual(nice_number(6.777, speech=False), + '6 7/9', + 'should format 6.777 as 6 7/9 not {}'.format( + nice_number(6.777, speech=False))) + self.assertEqual(nice_number(6.0, speech=False), + '6', + 'should format 6.0 as 6 not {}'.format( + nice_number(6.0, speech=False))) + + +class TestPronounceNumber(unittest.TestCase): + def test_convert_int(self): + self.assertEqual(pronounce_number(0), "ܣܝܦܪ") + self.assertEqual(pronounce_number(1), "ܚܕ") + self.assertEqual(pronounce_number(10), "ܥܣܪܐ") + self.assertEqual(pronounce_number(15), "ܚܡܫܥܣܪ") + self.assertEqual(pronounce_number(20), "ܥܣܪܝܢ") + self.assertEqual(pronounce_number(27), "ܥܣܪܝܢ ܘܫܒܥܐ") + self.assertEqual(pronounce_number(30), "ܬܠܬܝܢ") + self.assertEqual(pronounce_number(33), "ܬܠܬܝܢ ܘܬܠܬܐ") + + def test_convert_negative_int(self): + self.assertEqual(pronounce_number(-1), "ܣܚܘܦܐ ܚܕ") + self.assertEqual(pronounce_number(-10), "ܣܚܘܦܐ ܥܣܪܐ") + self.assertEqual(pronounce_number(-15), "ܣܚܘܦܐ ܚܡܫܥܣܪ") + self.assertEqual(pronounce_number(-20), "ܣܚܘܦܐ ܥܣܪܝܢ") + self.assertEqual(pronounce_number(-27), "ܣܚܘܦܐ ܥܣܪܝܢ ܘܫܒܥܐ") + + def test_convert_decimals(self): + self.assertEqual(pronounce_number(0.05), "ܚܡܫܐ ܡܢ ܡܐܐ") + self.assertEqual(pronounce_number(-0.05), "ܣܚܘܦܐ 
ܚܡܫܐ ܡܢ ܡܐܐ") + self.assertEqual(pronounce_number(1.234), + "ܚܕ ܘܥܣܪܝܢ ܘܬܠܬܐ ܡܢ ܡܐܐ") + self.assertEqual(pronounce_number(21.234), + "ܥܣܪܝܢ ܘܚܕ ܘܥܣܪܝܢ ܘܬܠܬܐ ܡܢ ܡܐܐ") + self.assertEqual(pronounce_number(21.234, places=1), + "ܥܣܪܝܢ ܘܚܕ ܘܬܪܝܢ ܡܢ ܥܣܪܐ") + self.assertEqual(pronounce_number(21.234, places=0), + "ܥܣܪܝܢ ܘܚܕ") + self.assertEqual(pronounce_number(21.234, places=3), + "ܥܣܪܝܢ ܘܚܕ ܘܬܪܝܢܡܐܐ ܘܬܠܬܝܢ ܘܐܪܒܥܐ ܡܢ ܐܠܦܐ") + self.assertEqual(pronounce_number(21.234, places=4), + "ܥܣܪܝܢ ܘܚܕ ܘܬܪܝܢܡܐܐ ܘܬܠܬܝܢ ܘܐܪܒܥܐ ܡܢ ܐܠܦܐ") + self.assertEqual(pronounce_number(21.234, places=5), + "ܥܣܪܝܢ ܘܚܕ ܘܬܪܝܢܡܐܐ ܘܬܠܬܝܢ ܘܐܪܒܥܐ ܡܢ ܐܠܦܐ") + self.assertEqual(pronounce_number(-1.234), + "ܣܚܘܦܐ ܚܕ ܘܥܣܪܝܢ ܘܬܠܬܐ ܡܢ ܡܐܐ") + self.assertEqual(pronounce_number(-21.234), + "ܣܚܘܦܐ ܥܣܪܝܢ ܘܚܕ ܘܥܣܪܝܢ ܘܬܠܬܐ ܡܢ ܡܐܐ") + self.assertEqual(pronounce_number(-21.234, places=1), + "ܣܚܘܦܐ ܥܣܪܝܢ ܘܚܕ ܘܬܪܝܢ ܡܢ ܥܣܪܐ") + + def test_convert_hundreds(self): + self.assertEqual(pronounce_number(100), "ܡܐܐ") + self.assertEqual(pronounce_number(666), "ܫܬܡܐܐ ܘܫܬܝܢ ܘܫܬܐ") + self.assertEqual(pronounce_number(1456), "ܐܠܦܐ ܘܐܪܒܥܡܐܐ ܘܚܡܫܝܢ ܘܫܬܐ") + self.assertEqual(pronounce_number(1567), "ܐܠܦܐ ܘܚܡܫܡܐܐ ܘܫܬܝܢ ܘܫܒܥܐ") + self.assertEqual(pronounce_number(3456), "ܬܠܬܐ ܐܠܦܐ ܘܐܪܒܥܡܐܐ ܘܚܡܫܝܢ ܘܫܬܐ") + self.assertEqual(pronounce_number(18691), "ܬܡܢܥܣܪ ܐܠܦܐ ܘܫܬܡܐܐ ܘܬܫܥܝܢ ܘܚܕ") + self.assertEqual(pronounce_number(103254654), + "ܡܐܐ ܘܬܠܬܐ ܡܠܝܘܢܐ ܘܬܪܝܢܡܐܐ ܘܚܡܫܝܢ ܘܐܪܒܥܐ ܐܠܦܐ ܘܫܬܡܐܐ ܘܚܡܫܝܢ ܘܐܪܒܥܐ") + self.assertEqual(pronounce_number(1512457), "ܚܕ ܡܠܝܘܢܐ ܘܚܡܫܡܐܐ ܘܬܪܥܣܪ ܐܠܦܐ ܘܐܪܒܥܡܐܐ ܘܚܡܫܝܢ ܘܫܒܥܐ") + self.assertEqual(pronounce_number(209996), "ܬܪܝܢܡܐܐ ܘܬܫܥܐ ܐܠܦܐ ܘܬܫܥܡܐܐ ܘܬܫܥܝܢ ܘܫܬܐ") + + def test_convert_scientific_notation(self): + self.assertEqual(pronounce_number(0, scientific=True), "ܣܝܦܪ") + self.assertEqual(pronounce_number(33, scientific=True), + "ܬܠܬܐ ܘܬܠܬܐ ܡܢ ܥܣܪܐ ܥܦܝܦ ܥܣܪܐ ܒܚܝܠܐ ܕܚܕ") + self.assertEqual(pronounce_number(299792458, scientific=True), + "ܬܪܝܢ ܘܬܫܥܝܢ ܘܬܫܥܐ ܡܢ ܡܐܐ ܥܦܝܦ ܥܣܪܐ ܒܚܝܠܐ ܕܬܡܢܝܐ") + + def test_ordinals(self): + 
self.assertEqual(pronounce_number(1, ordinals=True), "ܩܕܡܝܐ") + self.assertEqual(pronounce_number(10, ordinals=True), "ܥܣܝܪܝܐ") + self.assertEqual(pronounce_number(15, ordinals=True), "ܚܡܫܥܣܝܪܝܐ") + self.assertEqual(pronounce_number(20, ordinals=True), "ܥܣܪܝܢܝܐ") + self.assertEqual(pronounce_number(27, ordinals=True), "ܥܣܪܝܢ ܘܫܒܝܥܝܐ") + self.assertEqual(pronounce_number(30, ordinals=True), "ܬܠܬܝܢܝܐ") + self.assertEqual(pronounce_number(33, ordinals=True), "ܬܠܬܝܢ ܘܬܠܝܬܝܐ") + self.assertEqual(pronounce_number(55, ordinals=True), "ܚܡܫܝܢ ܘܚܡܝܫܝܐ") + self.assertEqual(pronounce_number(100, ordinals=True), "ܐܡܝܐ") + self.assertEqual(pronounce_number(1000, ordinals=True), "ܐܠܦܝܐ") + self.assertEqual(pronounce_number(1500, ordinals=True), "ܐܠܦܐ ܘܚܡܫܡܝܐ") + self.assertEqual(pronounce_number(10000, ordinals=True), "ܪܒܘܬܢܝܐ") + + +class TestNiceDateFormat(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Read date_time_test.json files for test data + cls.test_config = {} + p = Path(date_time_format.config_path) + for sub_dir in [x for x in p.iterdir() if x.is_dir()]: + if (sub_dir / 'date_time_test.json').exists(): + print("Getting test for " + + str(sub_dir / 'date_time_test.json')) + with (sub_dir / 'date_time_test.json').open() as f: + cls.test_config[sub_dir.parts[-1]] = json.loads(f.read()) + + + def test_convert_times(self): + dt = datetime.datetime(2017, 1, 31, + 13, 22, 3, tzinfo=default_timezone()) + + # Verify defaults haven't changed + self.assertEqual(nice_time(dt), + nice_time(dt, "syr-sy", True, False, False)) + + self.assertEqual(nice_time(dt), + "ܚܕ ܘܥܣܪܝܢ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ") + self.assertEqual(nice_time(dt, use_ampm=True), + "ܚܕ ܘܥܣܪܝܢ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ ܒܬܪ ܛܗܪܐ") + self.assertEqual(nice_time(dt, speech=False), + "1:22") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "1:22 PM") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:22") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + 
"13:22") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "ܬܠܬܥܣܪ ܘܥܣܪܝܢ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "ܬܠܬܥܣܪ ܘܥܣܪܝܢ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ") + + dt = datetime.datetime(2017, 1, 31, + 13, 0, 3, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt), + "ܚܕ") + self.assertEqual(nice_time(dt, use_ampm=True), + "ܚܕ ܒܬܪ ܛܗܪܐ") + self.assertEqual(nice_time(dt, speech=False), + "1:00") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "1:00 PM") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:00") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "13:00") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "ܬܠܬܥܣܪ") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "ܬܠܬܥܣܪ") + + dt = datetime.datetime(2017, 1, 31, + 13, 2, 3, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt), + "ܚܕ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ") + self.assertEqual(nice_time(dt, use_ampm=True), + "ܚܕ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ ܒܬܪ ܛܗܪܐ") + self.assertEqual(nice_time(dt, speech=False), + "1:02") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "1:02 PM") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "13:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "ܬܠܬܥܣܪ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "ܬܠܬܥܣܪ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ") + + dt = datetime.datetime(2017, 1, 31, + 0, 2, 3, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt), + "ܬܪܥܣܪ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ") + self.assertEqual(nice_time(dt, use_ampm=True), + "ܬܪܥܣܪ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ ܩܕܡ ܛܗܪܐ") + self.assertEqual(nice_time(dt, speech=False), + "12:02") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "12:02 AM") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "00:02") + 
self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "00:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "ܣܝܦܪ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "ܣܝܦܪ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ") + + dt = datetime.datetime(2018, 2, 8, + 1, 2, 33, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt), + "ܚܕ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ") + self.assertEqual(nice_time(dt, use_ampm=True), + "ܚܕ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ ܩܕܡ ܛܗܪܐ") + self.assertEqual(nice_time(dt, speech=False), + "1:02") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "1:02 AM") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "01:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "01:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "ܚܕ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "ܚܕ ܘܬܪܝܢ ܩܛܝܢܬ̈ܐ") + + dt = datetime.datetime(2017, 1, 31, + 12, 15, 9, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt), + "ܬܪܥܣܪ ܘܪܘܒܥܐ") + self.assertEqual(nice_time(dt, use_ampm=True), + "ܬܪܥܣܪ ܘܪܘܒܥܐ ܒܬܪ ܛܗܪܐ") + + dt = datetime.datetime(2017, 1, 31, + 5, 30, 00, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt, use_ampm=True), + "ܚܡܫܐ ܘܦܠܓܐ ܩܕܡ ܛܗܪܐ") + + dt = datetime.datetime(2017, 1, 31, + 1, 45, 00, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt), + "ܪܘܒܥܐ ܩܐ ܬܪܝܢ") + + def test_join(self): + self.assertEqual(join_list(None, "and"), "") + self.assertEqual(join_list([], "and"), "") + + self.assertEqual(join_list(["ܐ"], "ܘ"), "ܐ") + self.assertEqual(join_list(["ܐ", "ܒ"], "ܘ"), "ܐ ܘ ܒ") + self.assertEqual(join_list(["ܐ", "ܒ"], "ܐܘ"), "ܐ ܐܘ ܒ") + + self.assertEqual(join_list(["ܐ", "ܒ", "ܓ"], "ܘ"), "ܐ, ܒ ܘ ܓ") + self.assertEqual(join_list(["ܐ", "ܒ", "ܓ"], "ܐܘ"), "ܐ, ܒ ܐܘ ܓ") + self.assertEqual(join_list(["ܐ", "ܒ", "ܓ"], "ܐܘ", "؛"), "ܐ؛ ܒ ܐܘ ܓ") + self.assertEqual(join_list(["ܐ", "ܒ", "ܓ", "ܕ"], 
"ܐܘ"), "ܐ, ܒ, ܓ ܐܘ ܕ") + + self.assertEqual(join_list([1, "ܒ", 3, "ܕ"], "ܐܘ"), "1, ܒ, 3 ܐܘ ܕ") + +class TestPluralForms(unittest.TestCase): + def test_pluralize(self): + self.assertEqual(get_plural_form_syr("ܫܪܪܐ", 1), "ܫܪܪܐ") + self.assertEqual(get_plural_form_syr("ܫܪܪܐ", 2), "ܫܪܪ̈ܐ") # Pluralize + self.assertEqual(get_plural_form_syr("ܫܪܪܬܐ", 1), "ܫܪܪܬܐ") + self.assertEqual(get_plural_form_syr("ܫܪܪܬܐ", 2), "ܫܪܪ̈ܬܐ") # Pluralize + self.assertEqual(get_plural_form_syr("ܒܝܬܐ", 1), "ܒܝܬܐ") + self.assertEqual(get_plural_form_syr("ܒܝܬܐ", 2), "ܒܝܬ̈ܐ") # Pluralize + self.assertEqual(get_plural_form_syr("ܝܠܘܦܐ", 2), "ܝܠܘܦ̈ܐ") # Pluralize + self.assertEqual(get_plural_form_syr("ܟܠܒܐ", 2), "ܟܠܒ̈ܐ") # Pluralize + + self.assertEqual(get_plural_form_syr("ܒܝܬ̈ܐ", 1), "ܒܝܬܐ") # Singularize + self.assertEqual(get_plural_form_syr("ܚܒܘܫ̈ܐ", 1), "ܚܒܘܫܐ") # Singularize + self.assertEqual(get_plural_form_syr("ܦܬܘܪ̈ܐ", 1), "ܦܬܘܪܐ") # Singularize + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_parse_syr.py b/test/test_parse_syr.py new file mode 100644 index 00000000..d9b0cd7e --- /dev/null +++ b/test/test_parse_syr.py @@ -0,0 +1,166 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import unittest +from datetime import datetime, timedelta +from dateutil import tz + +from lingua_franca import load_language, unload_language, set_default_lang +from lingua_franca.internal import FunctionNotLocalizedError +from lingua_franca.parse import extract_datetime +from lingua_franca.parse import extract_duration +from lingua_franca.parse import extract_number, extract_numbers +from lingua_franca.parse import fuzzy_match +from lingua_franca.parse import get_gender +from lingua_franca.parse import match_one +from lingua_franca.parse import normalize +from lingua_franca.lang.parse_syr import extract_datetime_syr +from lingua_franca.lang.parse_syr import is_fractional_syr +from lingua_franca.time import default_timezone + + +def setUpModule(): + load_language('syr') + set_default_lang('syr') + +def tearDownModule(): + unload_language('syr') + +class TestNormalize(unittest.TestCase): + + def test_extract_number(self): + self.assertEqual(extract_number("ܐܗܐ ܝܠܗ ܢܣܝܢܐ ܩܕܡܝܐ", + ordinals=True), 1) + self.assertEqual(extract_number("ܐܗܐ ܝܠܗ ܢܣܝܢܐ ܬܪܝܢܐ"), 2) + self.assertEqual(extract_number("ܐܗܐ ܝܠܗ ܢܣܝܢܐ ܪܒܝܥܝܐ"), 4) + self.assertEqual(extract_number("ܬܠܬܐ ܟ̈ܣܐ"), 3) + self.assertEqual(extract_number("ܚܕ ܘܦܠܓܐ ܟ̈ܣܐ"), 1.5) + self.assertEqual(extract_number("ܥܣܪܝܢ ܘܬܪܝܢ"), 22) + self.assertEqual(extract_number("ܬܪܝܢܡܐܐ"), 200) + self.assertEqual(extract_number("ܬܫܥܐ ܐܠܦܐ"), 9000) + self.assertEqual(extract_number("ܐܠܦܐ ܘܚܡܫܡܐܐ"), 1500) + self.assertEqual(extract_number("ܫܬܡܐܐ ܘܫܬܝܢ ܘܫܬܐ"), 666) + self.assertEqual(extract_number("ܬܪܝܢ ܡܠܝܘܢܐ"), 2000000) + self.assertEqual(extract_number("ܬܪܝܢ ܐܠܦܐ ܘܫܒܥܣܪ"), 2017) + self.assertEqual(extract_number("ܫܬܥܣܪ ܐܠܦܐ ܘܡܐܐ ܘܚܡܫܥܣܪ"), 16115) + self.assertEqual(extract_number("ܬܡܢܥܣܪ ܡܠܝܘܢܐ ܘܬܡܢܥܣܪ ܐܠܦܐ ܘܬܪܝܢܡܐܐ ܘܬܡܢܥܣܪ"), 18018218) + self.assertEqual(extract_number("ܬܪܝܢ ܡܠܝܘܢܐ ܘܚܡܫܡܐܐ ܐܠܦܐ"), 2500000) + + def test_extract_duration_syr(self): + self.assertEqual(extract_duration("10 ܪ̈ܦܦܐ"), + (timedelta(seconds=10.0), 
"")) + self.assertEqual(extract_duration("5 ܩܛܝܢܬ̈ܐ"), + (timedelta(minutes=5), "")) + self.assertEqual(extract_duration("2 ܫܥ̈ܐ"), + (timedelta(hours=2), "")) + self.assertEqual(extract_duration("3 ܝܘܡܢ̈ܐ"), + (timedelta(days=3), "")) + self.assertEqual(extract_duration("25 ܫܒܘܥ̈ܐ"), + (timedelta(weeks=25), "")) + self.assertEqual(extract_duration("ܫܒܥܐ ܫܥ̈ܐ"), + (timedelta(hours=7), "")) + self.assertEqual(extract_duration("7.5 ܪ̈ܦܦܐ"), + (timedelta(seconds=7.5), "")) + self.assertEqual(extract_duration("ܬܡܢܝܐ ܘܦܠܓܐ ܝܘܡܢ̈ܐ ܘܬܠܬܝܢ ܘܬܫܥܐ ܪ̈ܦܦܐ"), + (timedelta(days=8.5, seconds=39), "")) + self.assertEqual(extract_duration("ܡܬܒ ܡܐܢܐ ܙܒ̣ܢܢܝܐ ܩܐ ܬܠܬܝܢ ܩܛܝܢܬ̈ܐ ܐܚܪܢܐ"), + (timedelta(minutes=30), "ܡܬܒ ܡܐܢܐ ܙܒ̣ܢܢܝܐ ܩܐ ܐܚܪܢܐ")) + self.assertEqual(extract_duration("ܡܬܒ ܥܕܢܐ ܐܪܒܥܐ ܘܦܠܓܐ ܩܛܝܢܬ̈ܐ ܠܙܪܩܬܐ ܕܫܡܫܐ"), + (timedelta(minutes=4.5), "ܡܬܒ ܥܕܢܐ ܠܙܪܩܬܐ ܕܫܡܫܐ")) + self.assertEqual(extract_duration("ܐܗܐ ܨܘܪܬܐ ܙܝܘܥܬܐ ܟܐ ܓܪܫ ܥܕܢܐ ܚܕ ܫܥܬܐ ܘܚܡܫܝܢ ܘܫܒܥܐ ܘܦܠܓܐ ܩܛܝܢܬ̈ܐ"), + (timedelta(hours=1, minutes=57.5), + "ܐܗܐ ܨܘܪܬܐ ܙܝܘܥܬܐ ܟܐ ܓܪܫ ܥܕܢܐ")) + + def test_extractdatetime_syr(self): + + def extractWithFormat(text): + # BUG: Time is read as 2017-06-27 08:04:00 which is incorrect + date = datetime(2017, 6, 27, 13, 4, tzinfo=default_timezone()) # Tue June 27, 2017 @ 1:04pm + [extractedDate, leftover] = extract_datetime_syr(text, date) + extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") + return [extractedDate, leftover] + + def testExtract(text, expected_date, expected_leftover): + res = extractWithFormat(text) + self.assertEqual(res[0], expected_date, "for=" + text) + self.assertEqual(res[1], expected_leftover, "for=" + text) + + testExtract("ܗܫܐ ܝܠܗ ܥܕܢܐ", + "2017-06-27 13:04:00", "ܝܠܗ ܥܕܢܐ") + testExtract("ܚܕ ܪܦܦܐ ܝܬܝܪ", + "2017-06-27 13:04:01", "ܝܬܝܪ") + testExtract("ܝܠܗ ܚܕ ܩܛܝܢܐ", + "2017-06-27 13:05:00", "ܝܠܗ") + testExtract("ܬܪܝܢ ܩܛܝܢܬ̈ܐ", + "2017-06-27 13:06:00", "") + testExtract("ܝܠܗ ܥܕܢܐ ܚܫܝܚܬܐ", + "2017-06-27 13:04:00", "ܝܠܗ ܥܕܢܐ ܚܫܝܚܬܐ") + testExtract("ܐܢܐ ܒܥܝܢ ܩܐ 
ܚܕ ܫܥܬܐ ܐܚܪܢܐ", + "2017-06-27 14:04:00", "ܐܢܐ ܒܥܝܢ ܩܐ ܐܚܪܢܐ") + testExtract("1 ܪܦܦܐ ܐܚܪܢܐ", + "2017-06-27 13:04:01", "ܐܚܪܢܐ") + testExtract("2 ܪ̈ܦܦܐ ܐܚܪܢܐ", + "2017-06-27 13:04:02", "ܐܚܪܢܐ") + testExtract("ܡܬܒ ܡܐܢܐ ܙܒܢܢܝܐ ܩܐ ܚܕ ܩܛܝܢܐ ܒܬܪ", + "2017-06-27 13:05:00", "ܡܬܒ ܡܐܢܐ ܙܒܢܢܝܐ ܩܐ ܒܬܪ") + testExtract("ܡܬܒ ܡܐܢܐ ܙܒܢܢܝܐ ܩܐ ܦܠܓܐ ܫܥܬܐ ܐܚܪܢܐ", + "2017-06-27 13:34:00", "ܡܬܒ ܡܐܢܐ ܙܒܢܢܝܐ ܩܐ ܐܚܪܢܐ") + testExtract("ܡܬܒ ܡܐܢܐ ܙܒܢܢܝܐ ܩܐ ܚܡܫܐ ܝܘܡܢ̈ܐ ܒܬܪ", + "2017-07-02 13:04:00", "ܡܬܒ ܡܐܢܐ ܙܒܢܢܝܐ ܩܐ") + testExtract("ܝܘܡܐ ܐܚܪܢܐ", + "2017-06-29 00:00:00", "") + testExtract("ܡܘܕܝ ܝܠܗ ܡܘܙܓܐ ܕܐܐܪ ܝܘܡܐ ܐܚܪܢܐ؟", + "2017-06-29 00:00:00", "ܡܘܕܝ ܝܠܗ ܡܘܙܓܐ ܕܐܐܪ") + testExtract("ܕܐܟܝ ܝܠܗ ܡܘܙܓܐ ܕܐܐܪ ܥܪܘܒܬܐ ܩܕܡ ܛܗܪܐ؟", + "2017-06-30 08:00:00", "ܕܐܟܝ ܝܠܗ ܡܘܙܓܐ ܕܐܐܪ") + testExtract("ܕܐܟܝ ܝܠܗ ܡܘܙܓܐ ܕܐܐܪ ܩܘܕܡܐ ܕܐܬܐ؟", + "2017-06-28 00:00:00", "ܕܐܟܝ ܝܠܗ ܡܘܙܓܐ ܕܐܐܪ") + testExtract("ܕܐܟܝ ܝܠܗ ܡܘܙܓܐ ܕܐܐܪ ܐܕܝܘܡ ܒܬܪ ܛܗܪܐ؟", + "2017-06-27 15:00:00", "ܕܐܟܝ ܝܠܗ ܡܘܙܓܐ ܕܐܐܪ") + testExtract("ܕܟܪ ܩܖܝ ܩܐ ܝܡܝ ܬܡܢܝܐ ܫܒܘܥ̈ܐ ܘܬܪܝܢ ܝܘܡܢ̈ܐ", + "2017-08-24 00:00:00", "ܕܟܪ ܩܖܝ ܩܐ ܝܡܝ") + + def test_multiple_numbers(self): + self.assertEqual(extract_numbers("ܚܕ ܬܪܝܢ ܬܠܬܐ"), + [1.0, 2.0, 3.0]) + self.assertEqual(extract_numbers("ܥܣܪܝܢ ܘܬܠܬܐ"), + [23]) + self.assertEqual(extract_numbers("ܥܣܪܝܢ ܬܠܬܐ"), + [20, 3]) + self.assertEqual(extract_numbers("ܥܣܪܐ ܥܣܪܝܢ ܬܠܬܐ ܚܡܫܥܣܪ ܐܠܦܐ ܘܫܬܝܢ ܫܬܥܣܪ"), + [10, 20, 3, 15060, 16]) + + def test_is_fraction_syr(self): + self.assertEqual(is_fractional_syr("ܦܠܓܐ"), 1.0 / 2) + self.assertEqual(is_fractional_syr("ܦܠܓܘܬ"), 1.0 / 2) + self.assertEqual(is_fractional_syr("ܬܘܠܬܐ"), 1.0 / 3) + self.assertEqual(is_fractional_syr("ܪܘܒܥܐ"), 1.0 / 4) + self.assertEqual(is_fractional_syr("ܚܕ ܡܢ ܬܠܬܐ"), 1.0 / 3) + self.assertEqual(is_fractional_syr("ܚܕ ܡܢ ܐܪܒܥܐ"), 1.0 / 4) + self.assertEqual(is_fractional_syr("ܚܕ ܡܢ ܫܒܥܐ"), 1.0 / 7) + self.assertEqual(is_fractional_syr("ܚܕ ܡܢ ܥܣܪܝܢ"), 1.0 / 20) + self.assertEqual(is_fractional_syr("ܚܕܐ ܡܢ ܥܣܪܝܢ"), 1.0 / 20) + self.assertEqual(is_fractional_syr("ܚܕ ܡܢ 
ܬܠܬܝܢ"), 1.0 / 30) + self.assertEqual(is_fractional_syr("ܚܕ ܡܢ ܡܐܐ"), 1.0 / 100) + self.assertEqual(is_fractional_syr("ܚܕܐ ܡܢ ܡܐܐ"), 1.0 / 100) + self.assertEqual(is_fractional_syr("ܚܕܐ ܡܢ ܐܠܦܐ"), 1.0 / 1000) + self.assertEqual(is_fractional_syr("ܬܠܬܐ ܡܢ ܐܪܒܥܐ"), 3.0 / 4) + self.assertEqual(is_fractional_syr("ܚܡܫܐ ܡܢ ܫܬܐ"), 5.0 / 6) + self.assertEqual(is_fractional_syr("ܚܕ ܡܢ ܐܠܦܐ"), 1.0 / 1000) + self.assertEqual(is_fractional_syr("ܚܕ ܡܢ ܡܠܝܘܢܐ"), 1.0 / 1000000) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file