From a1c5d8b056d3b073d079605e21d3e6cbb21443ba Mon Sep 17 00:00:00 2001 From: hamidreza kalbasi Date: Thu, 24 Dec 2020 21:43:37 +0330 Subject: [PATCH 1/2] implement farsi support --- lingua_franca/internal.py | 5 +- lingua_franca/lang/common_data_fa.py | 115 +++++ lingua_franca/lang/format_fa.py | 306 ++++++++++++++ lingua_franca/lang/parse_fa.py | 393 +++++++++++++++++ lingua_franca/res/text/fa-ir/and.word | 1 + lingua_franca/res/text/fa-ir/date_time.json | 180 ++++++++ .../res/text/fa-ir/date_time_test.json | 36 ++ lingua_franca/res/text/fa-ir/day.word | 1 + lingua_franca/res/text/fa-ir/days.word | 1 + lingua_franca/res/text/fa-ir/hour.word | 1 + lingua_franca/res/text/fa-ir/hours.word | 1 + lingua_franca/res/text/fa-ir/minute.word | 1 + lingua_franca/res/text/fa-ir/minutes.word | 1 + lingua_franca/res/text/fa-ir/normalize.json | 141 +++++++ lingua_franca/res/text/fa-ir/or.word | 1 + lingua_franca/res/text/fa-ir/second.word | 1 + lingua_franca/res/text/fa-ir/seconds.word | 1 + test/test_format_fa.py | 394 ++++++++++++++++++ test/test_parse_fa.py | 170 ++++++++ 19 files changed, 1748 insertions(+), 2 deletions(-) create mode 100644 lingua_franca/lang/common_data_fa.py create mode 100644 lingua_franca/lang/format_fa.py create mode 100644 lingua_franca/lang/parse_fa.py create mode 100644 lingua_franca/res/text/fa-ir/and.word create mode 100644 lingua_franca/res/text/fa-ir/date_time.json create mode 100644 lingua_franca/res/text/fa-ir/date_time_test.json create mode 100644 lingua_franca/res/text/fa-ir/day.word create mode 100644 lingua_franca/res/text/fa-ir/days.word create mode 100644 lingua_franca/res/text/fa-ir/hour.word create mode 100644 lingua_franca/res/text/fa-ir/hours.word create mode 100644 lingua_franca/res/text/fa-ir/minute.word create mode 100644 lingua_franca/res/text/fa-ir/minutes.word create mode 100644 lingua_franca/res/text/fa-ir/normalize.json create mode 100644 lingua_franca/res/text/fa-ir/or.word create mode 100644 
lingua_franca/res/text/fa-ir/second.word create mode 100644 lingua_franca/res/text/fa-ir/seconds.word create mode 100644 test/test_format_fa.py create mode 100644 test/test_parse_fa.py diff --git a/lingua_franca/internal.py b/lingua_franca/internal.py index 5a47ef7a..7b996c2f 100644 --- a/lingua_franca/internal.py +++ b/lingua_franca/internal.py @@ -8,12 +8,13 @@ from lingua_franca import config _SUPPORTED_LANGUAGES = ("ca", "cs", "da", "de", "en", "es", "fr", "hu", - "it", "nl", "pl", "pt", "sl", "sv") + "it", "nl", "pl", "pt", "sl", "sv", "fa") _SUPPORTED_FULL_LOCALIZATIONS = ("ca-es", "cs-cz", "da-dk", "de-de", "en-au", "en-us", "es-es", "fr-fr", "hu-hu", "it-it", "nl-nl", "pl-pl", - "pt-pt", "ru-ru", "sl-si", "sv-se", "tr-tr") + "fa-ir", "pt-pt", "ru-ru", "sl-si", + "sv-se", "tr-tr") _DEFAULT_FULL_LANG_CODES = {'ca': 'ca-es', 'cs': 'cs-cz', diff --git a/lingua_franca/lang/common_data_fa.py b/lingua_franca/lang/common_data_fa.py new file mode 100644 index 00000000..f44a2198 --- /dev/null +++ b/lingua_franca/lang/common_data_fa.py @@ -0,0 +1,115 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from collections import OrderedDict +from .parse_common import invert_dict + +_FUNCTION_NOT_IMPLEMENTED_WARNING = "تابع خواسته شده در زبان فارسی پیاده سازی نشده است." 
+ + +_FRACTION_STRING_FA = { + 2: 'دوم', + 3: 'سوم', + 4: 'چهارم', + 5: 'پنجم', + 6: 'ششم', + 7: 'هفتم', + 8: 'هشتم', + 9: 'نهم', + 10: 'دهم', + 11: 'یازدهم', + 12: 'دوازدهم', + 13: 'سیزدهم', + 14: 'چهاردهم', + 15: 'پونزدهم', + 16: 'شونزدهم', + 17: 'هیفدهم', + 18: 'هیجدهم', + 19: 'نوزدهم', + 20: 'بیستم' +} + + +_FARSI_ONES = [ + "", + "یک", + "دو", + "سه", + "چهار", + "پنج", + "شش", + "هفت", + "هشت", + "نه", + "ده", + "یازده", + "دوازده", + "سیزده", + "چهارده", + "پونزده", + "شونزده", + "هیفده", + "هیجده", + "نوزده", +] + +_FARSI_TENS = [ + "", + "ده", + "بیست", + "سی", + "چهل", + "پنجاه", + "شصت", + "هفتاد", + "هشتاد", + "نود", +] + +_FARSI_HUNDREDS = [ + "", + "صد", + "دویست", + "سیصد", + "چهارصد", + "پانصد", + "ششصد", + "هفتصد", + "هشتصد", + "نهصد", +] + +_FARSI_BIG = [ + '', + 'هزار', + 'میلیون', + "میلیارد", + 'تریلیون', + "تریلیارد", +] + + +_FORMAL_VARIANT = { + 'هفده': 'هیفده', + 'هجده': 'هیجده', + 'شانزده': 'شونزده', + 'پانزده': 'پونزده', +} + + +_FARSI_FRAC = ["", "ده", "صد"] +_FARSI_FRAC_BIG = ["", "هزار", "میلیونی", "میلیاردی"] + +_FARSI_SEPERATOR = ' و ' \ No newline at end of file diff --git a/lingua_franca/lang/format_fa.py b/lingua_franca/lang/format_fa.py new file mode 100644 index 00000000..10f944a7 --- /dev/null +++ b/lingua_franca/lang/format_fa.py @@ -0,0 +1,306 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

from lingua_franca.lang.format_common import convert_to_mixed_fraction
from lingua_franca.lang.common_data_fa import \
    _FARSI_ONES, _FARSI_TENS, _FARSI_HUNDREDS, _FARSI_BIG, _FARSI_SEPERATOR, \
    _FARSI_FRAC, _FARSI_FRAC_BIG, _FRACTION_STRING_FA, _FORMAL_VARIANT
import math
from lingua_franca.internal import lookup_variant
from enum import IntEnum
from functools import wraps


class NumberVariantFA(IntEnum):
    """Spelling variants available for Farsi number words."""
    CONVERSATIONAL = 0
    FORMAL = 1


lookup_number = lookup_variant({
    "default": NumberVariantFA.CONVERSATIONAL,
    "conversational": NumberVariantFA.CONVERSATIONAL,
    "formal": NumberVariantFA.FORMAL,
})


def _applyVariant(text, variant):
    """Rewrite conversational number words in *text* to the requested variant.

    The formatters in this module always produce the conversational
    spelling; for ``NumberVariantFA.FORMAL`` every conversational word
    listed in ``_FORMAL_VARIANT`` is replaced by its formal spelling.
    Any other variant returns *text* unchanged.

    Args:
        text (str): formatted output of one of the formatters
        variant (NumberVariantFA): the variant to apply
    Returns:
        (str): the text in the requested variant
    """
    if variant == NumberVariantFA.FORMAL:
        # _FORMAL_VARIANT maps formal -> conversational, so replace in
        # the opposite direction.
        for formal, conversational in _FORMAL_VARIANT.items():
            text = text.replace(conversational, formal)
    return text


def _handleVariant(func):
    """Decorator: resolve the ``variant`` keyword and apply it to the result.

    The wrapped function is called as-is; when the caller supplied a
    ``variant`` keyword argument its string result is post-processed by
    ``_applyVariant``.
    """
    # NOTE(review): lookup_variant is presumed to translate a variant
    # name such as "formal" into the NumberVariantFA member before the
    # wrapper runs - confirm against lingua_franca.internal.
    @wraps(func)
    @lookup_variant({
        "default": NumberVariantFA.CONVERSATIONAL,
        "conversational": NumberVariantFA.CONVERSATIONAL,
        "formal": NumberVariantFA.FORMAL,
    })
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        if 'variant' in kwargs:
            return _applyVariant(result, kwargs['variant'])
        return result
    return wrapper


@_handleVariant
def nice_number_fa(number, speech=True, denominators=range(1, 21),
                   variant=None):
    """ Farsi helper for nice_number

    This function formats a float as a human understandable mixed
    fraction. For display 4.5 becomes "4 1/2"; for speech the whole part
    and the numerator stay as digits and the denominator is spoken as a
    Farsi ordinal, e.g. "4 و یک دوم".

    Args:
        number (int or float): the float to format
        speech (bool): format for speech (True) or display (False)
        denominators (iter of ints): denominators to use, default [1 .. 20]
        variant: spelling variant, handled by the decorator
    Returns:
        (str): The formatted string.
    Raises:
        KeyError: in speech mode, if the chosen denominator has no entry
            in _FRACTION_STRING_FA (only 2 .. 20 are defined).
    """

    result = convert_to_mixed_fraction(number, denominators)
    if not result:
        # Give up, just represent as a 3 decimal number
        return str(round(number, 3))

    whole, num, den = result

    if not speech:
        if num == 0:
            # TODO: Number grouping? E.g. "1,000,000"
            return str(whole)
        return '{} {}/{}'.format(whole, num, den)

    if num == 0:
        return str(whole)
    den_str = _FRACTION_STRING_FA[den]
    if whole == 0:
        if num == 1:
            return_string = 'یک {}'.format(den_str)
        else:
            return_string = '{} {}'.format(num, den_str)
    elif num == 1:
        return_string = '{} و یک {}'.format(whole, den_str)
    else:
        return_string = '{} و {} {}'.format(whole, num, den_str)
    return return_string


def _float2tuple(value, _precision):
    """Split *value* into (integer part, fractional digits, digit count).

    The fractional part is scaled to ``_precision`` decimal digits and
    returned as an integer with trailing zeros stripped; the returned
    digit count is reduced accordingly.
    """
    pre = int(value)

    post = abs(value - pre) * 10**_precision
    if abs(round(post) - post) < 0.01:
        # We generally floor all values beyond our precision (rather than
        # rounding), but in cases where we have something like 1.239999999,
        # which is probably due to python's handling of floats, we actually
        # want to consider it as 1.24 instead of 1.23
        post = int(round(post))
    else:
        post = int(math.floor(post))

    while post != 0:
        x, y = divmod(post, 10)
        if y != 0:
            break
        post = x
        _precision -= 1

    return pre, post, _precision


def _cardinal3(number):
    """Spell an integer in the range 0 .. 999 (0 gives an empty string)."""
    # _FARSI_ONES holds dedicated words for 0 .. 19 inclusive; 19 must
    # not fall through to the tens branch ("ده و نه").
    if number < 20:
        return _FARSI_ONES[number]
    if number < 100:
        x, y = divmod(number, 10)
        if y == 0:
            return _FARSI_TENS[x]
        return _FARSI_TENS[x] + _FARSI_SEPERATOR + _FARSI_ONES[y]
    x, y = divmod(number, 100)
    if y == 0:
        return _FARSI_HUNDREDS[x]
    return _FARSI_HUNDREDS[x] + _FARSI_SEPERATOR + _cardinal3(y)


def _cardinalPos(number):
    """Spell a non-negative integer by groups of three digits.

    Returns an empty string for 0. Only the groups named in _FARSI_BIG
    are rendered; higher-order digits beyond the last entry are ignored.
    """
    x = number
    res = ''
    for b in _FARSI_BIG:
        x, y = divmod(x, 1000)
        if y == 0:
            continue
        yx = _cardinal3(y)
        if y == 1 and b == 'هزار':
            # one thousand is spoken as "هزار", not "یک هزار"
            yx = b
        elif b != '':
            yx += ' ' + b
        if res == '':
            res = yx
        else:
            res = yx + _FARSI_SEPERATOR + res
    return res


def _fractional(number, l):
    """Spell the decimal fraction ``number / 10**l``.

    Exactly one half is spoken as "نیم"; otherwise the numerator is
    spelled out and followed by the ordinal name of ``10**l``.
    """
    if number / 10**l == 0.5:
        return "نیم"
    x = _cardinalPos(number)
    ld3, lm3 = divmod(l, 3)
    ltext = (_FARSI_FRAC[lm3] + " " + _FARSI_FRAC_BIG[ld3]).strip() + 'م'
    return x + " " + ltext


def _to_ordinal(number):
    """Spell *number* as an ordinal word."""
    r = _to_cardinal(number, 0)
    # A cardinal ending in "سه" (three) becomes "...سوم".
    if r[-1] == 'ه' and r[-2] == 'س':
        return r[:-1] + 'وم'
    return r + 'م'


def _to_ordinal_num(value):
    """Return the digit form of an ordinal, e.g. 3 -> "3م"."""
    return str(value)+"م"


def _to_cardinal(number, places):
    """Spell *number* as a cardinal, keeping at most *places* decimals."""
    if number < 0:
        return "منفی " + _to_cardinal(-number, places)
    if number == 0:
        return "صفر"
    x, y, l = _float2tuple(number, places)
    if y == 0:
        # A value such as 0.001 at two places has neither an integer nor
        # a fractional part left; speak it as zero instead of returning
        # an empty string.
        return _cardinalPos(x) or "صفر"
    if x == 0:
        return _fractional(y, l)
    return _cardinalPos(x) + _FARSI_SEPERATOR + _fractional(y, l)


@_handleVariant
def pronounce_number_fa(number, places=2, scientific=False,
                        ordinals=False, variant=None):
    """
    Convert a number to its spoken Farsi equivalent

    For example, 5.2 would return 'پنج و دو دهم'

    Args:
        number (float or int): the number to pronounce
        places (int): maximum decimal places to speak
        scientific (bool): pronounce in scientific notation
        ordinals (bool): pronounce in ordinal form "first" instead of "one"
        variant: spelling variant, handled by the decorator
    Returns:
        (str): The pronounced number
    """
    # deal with infinity
    if number == float("inf"):
        return "بینهایت"
    elif number == float("-inf"):
        return "منفی بینهایت"
    if scientific:
        if number == 0:
            return "صفر"
        # Keep the '%E' text in its own names: when the exponent is 0 we
        # fall through to the plain branches below, which need the
        # original numeric value rather than a string.
        n, power = ('%E' % number).replace("+", "").split("E")
        power = int(power)
        if power != 0:
            return '{}{} ضرب در ده به توان {}{}'.format(
                'منفی ' if float(n) < 0 else '',
                pronounce_number_fa(
                    abs(float(n)), places, False, ordinals=False),
                'منفی ' if power < 0 else '',
                pronounce_number_fa(abs(power), places, False,
                                    ordinals=False))
    if ordinals:
        return _to_ordinal(number)
    return _to_cardinal(number, places)


@_handleVariant
def nice_time_fa(dt, speech=True, use_24hour=False, use_ampm=False,
                 variant=None):
    """
    Format a time to a comfortable human format
    For example, generate 'پنج و نیم' for speech or '5:30' for
    text display.
    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): include the am/pm for 12-hour format
        variant: spelling variant, handled by the decorator
    Returns:
        (str): The formatted time string
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    else:
        if use_ampm:
            # e.g. "3:01 AM" or "2:22 PM"
            string = dt.strftime("%I:%M %p")
        else:
            # e.g. "3:01" or "2:22"
            string = dt.strftime("%I:%M")
        if string[0] == '0':
            string = string[1:]  # strip leading zeros

    if not speech:
        return string

    # Generate a speakable version of the time
    if use_24hour:
        speak = ""

        # Hours: drop a leading zero so "08" is spoken as eight
        if string[0] == '0':
            speak += pronounce_number_fa(int(string[1]))
        else:
            speak = pronounce_number_fa(int(string[0:2]))
        # Minutes are only spoken when they are not "00"
        if not string[3:5] == '00':
            speak += " و "
            if string[3] == '0':
                speak += pronounce_number_fa(int(string[4]))
            else:
                speak += pronounce_number_fa(int(string[3:5]))
            speak += ' دقیقه'
        return speak
    else:
        if dt.hour == 0 and dt.minute == 0:
            return "نیمه شب"
        elif dt.hour == 12 and dt.minute == 0:
            return "ظهر"

        hour = dt.hour % 12 or 12  # 12 hour clock and 0 is spoken as 12
        if dt.minute == 15:
            speak = pronounce_number_fa(hour) + " و ربع"
        elif dt.minute == 30:
            speak = pronounce_number_fa(hour) + " و نیم"
        elif dt.minute == 45:
            # a quarter to the following hour
            next_hour = (dt.hour + 1) % 12 or 12
            speak = "یه ربع به " + pronounce_number_fa(next_hour)
        else:
            speak = pronounce_number_fa(hour)

            if dt.minute == 0:
                if not use_ampm:
                    return speak
            else:
                speak += " و " + pronounce_number_fa(dt.minute) + ' دقیقه'

        if use_ampm:
            if dt.hour > 11:
                speak += " بعد از ظهر"
            else:
                speak += " قبل از ظهر"

        return speak

# --- remainder of this collapsed patch line, preserved verbatim below
# --- (patch metadata for the next file):
# diff --git a/lingua_franca/lang/parse_fa.py b/lingua_franca/lang/parse_fa.py new
# --- start of this collapsed patch line, preserved verbatim
# --- (patch metadata continuing from the previous line):
# file mode 100644 index 00000000..bda9293f --- /dev/null +++ b/lingua_franca/lang/parse_fa.py @@ -0,0 +1,393 @@
#
# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from datetime import datetime, timedelta

from dateutil.relativedelta import relativedelta

from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \
    invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer
from lingua_franca.lang.common_data_fa import _FARSI_BIG, \
    _FARSI_ONES, _FARSI_TENS, _FARSI_HUNDREDS, _FORMAL_VARIANT, \
    _FRACTION_STRING_FA

import re
import json
from lingua_franca.internal import resolve_resource_file


def _is_number(s):
    """Return True when *s* can be converted with float()."""
    try:
        float(s)
        return True
    except ValueError:
        return False


def _parse_sentence(text):
    """Split *text* on whitespace and fold number words into numbers.

    Formal spellings are first rewritten to the conversational ones used
    by the lookup tables. The result is a list whose items are either a
    plain word (str) or a ``(value, words)`` tuple, where ``words`` is
    the list of source tokens that produced ``value``.

    The scanner is a small state machine; ``mode`` records what kind of
    number word was seen last ('num_one', 'num_ten', 'num_hundred', the
    same with a '_va' suffix after the joining word "و", or 'num' after
    a thousand/million/... word) so it can decide whether the next word
    continues the current number or starts a new item.
    """
    for key, value in _FORMAL_VARIANT.items():
        text = text.replace(key, value)
    ar = text.split()
    result = []
    current_number = 0   # value of the completed thousand-groups
    current_words = []   # source tokens of the number being built
    s = 0                # value of the group currently being built
    mode = 'init'

    def finish_num():
        # Emit the pending number (if it is non-zero) and reset state.
        nonlocal current_number
        nonlocal s
        nonlocal result
        nonlocal mode
        nonlocal current_words
        current_number += s
        if current_number != 0:
            result.append((current_number, current_words))
        s = 0
        current_number = 0
        current_words = []
        mode = 'init'

    for x in ar:
        if x == "و":
            if mode == 'num_ten' or mode == 'num_hundred' or mode == 'num_one':
                # "and" inside a number: remember it and keep building
                mode += '_va'
                current_words.append(x)
            elif mode == 'num':
                current_words.append(x)
            else:
                finish_num()
                result.append(x)
        elif x == "نیم":
            # "half" always closes the current number
            current_words.append(x)
            current_number += 0.5
            finish_num()
        elif x in _FARSI_ONES:
            t = _FARSI_ONES.index(x)
            if mode != 'init' and mode != 'num_hundred_va' and mode != 'num':
                # a unit (1..9) may follow "<tens> و"; anything else
                # starts a new number
                if not(t < 10 and mode == 'num_ten_va'):
                    finish_num()
            current_words.append(x)
            s += t
            mode = 'num_one'
        elif x in _FARSI_TENS:
            if mode != 'init' and mode != 'num_hundred_va' and mode != 'num':
                finish_num()
            current_words.append(x)
            s += _FARSI_TENS.index(x)*10
            mode = 'num_ten'
        elif x in _FARSI_HUNDREDS:
            if mode != 'init' and mode != 'num':
                finish_num()
            current_words.append(x)
            s += _FARSI_HUNDREDS.index(x)*100
            mode = 'num_hundred'
        elif x in _FARSI_BIG:
            current_words.append(x)
            d = _FARSI_BIG.index(x)
            if mode == 'init' and d == 1:
                # a bare "هزار" means one thousand
                s = 1
            s *= 10**(3*d)
            current_number += s
            s = 0
            mode = 'num'
        elif _is_number(x):
            current_words.append(x)
            current_number = float(x)
            finish_num()
        else:
            finish_num()
            result.append(x)
    if mode[:3] == 'num':
        finish_num()
    return result


_time_units = {
    'ثانیه': timedelta(seconds=1),
    'دقیقه': timedelta(minutes=1),
    'ساعت': timedelta(hours=1),
}

_date_units = {
    'روز': timedelta(days=1),
    'هفته': timedelta(weeks=1),
}


def extract_duration_fa(text):
    """
    Convert a Farsi phrase into a duration

    Every "<number> <unit>" pair found in the text (units: second,
    minute, hour, day, week) is added to the total.

    The words used in the duration will be consumed, and
    the remainder returned.

    Args:
        text (str): string containing a duration

    Returns:
        (timedelta, str):
                    A tuple containing the duration and the remaining text
                    not consumed in the parsing. The duration is
                    ``timedelta(0)`` if no duration is found. The joining
                    word "و" is never included in the remainder.
    """
    remainder = []
    ar = _parse_sentence(text)
    current_number = None
    result = timedelta(0)
    for x in ar:
        if x == "و":
            continue
        if isinstance(x, tuple):
            current_number = x
            continue
        unit = _time_units.get(x, _date_units.get(x))
        if unit is not None and current_number is not None:
            result += unit * current_number[0]
            current_number = None
            continue
        # Ordinary word, or a unit word with no number in front of it
        # (which cannot be scaled): hand it back to the caller together
        # with any number that turned out not to belong to a duration.
        if current_number:
            remainder.extend(current_number[1])
        remainder.append(x)
        current_number = None
    if current_number:
        # a trailing number that was never followed by a unit
        remainder.extend(current_number[1])
    return (result, " ".join(remainder))


def extract_datetime_fa(text, anchorDate=None, default_time=None):
    """ Convert a human date reference into an exact datetime

    Recognises relative day names (day before yesterday .. day after
    tomorrow), weekday names, "now", a morning/afternoon qualifier after
    a day, and "<number> <unit> later/ago" offsets. If a reference date
    is not provided, the current local time is used. Also consumes the
    words used to define the date, returning the remaining string.

    A weekday resolves to its next occurrence counted from the anchor
    date, where the anchor's own weekday counts as today. The one
    exception is Sunday: because Sunday is numbered 7 here, a Sunday
    anchor asking for Sunday yields the date one week ahead.

    Args:
        text (str): string containing date words
        anchorDate (datetime): A reference date/time for "tomorrow", etc
        default_time (time): currently unused by this implementation

    Returns:
        (datetime, str): A tuple containing the datetime (None if no date
                         words were found) and the remaining text not
                         consumed in the parsing. Returns None for an
                         empty string, or when a later/ago word follows
                         an already resolved day.
    """
    if text == "":
        return None
    # Normalise punctuation and join multi-word day names so that each
    # becomes a single token. "\u200c" is the zero-width non-joiner.
    text = text.lower().replace('\u200c', ' ').replace('.', '') \
        .replace('،', '') \
        .replace('?', '').replace("پس فردا", "پسفردا") \
        .replace('یک شنبه', 'یکشنبه') \
        .replace('دو شنبه', 'دوشنبه') \
        .replace('سه شنبه', 'سهشنبه') \
        .replace('چهار شنبه', 'چهارشنبه') \
        .replace('پنج شنبه', 'پنجشنبه') \
        .replace('بعد از ظهر', 'بعدازظهر')

    if not anchorDate:
        anchorDate = datetime.now()
    today = anchorDate.replace(hour=0, minute=0, second=0, microsecond=0)
    today_weekday = int(anchorDate.strftime("%w"))
    weekday_names = [
        'دوشنبه',
        'سهشنبه',
        'چهارشنبه',
        'پنجشنبه',
        'جمعه',
        'شنبه',
        'یکشنبه',
    ]
    daysDict = {
        'پریروز': today + timedelta(days=-2),
        'دیروز': today + timedelta(days=-1),
        'امروز': today,
        'فردا': today + timedelta(days=1),
        'پسفردا': today + timedelta(days=2),
    }
    timesDict = {
        'صبح': timedelta(hours=8),
        'بعدازظهر': timedelta(hours=15),
    }
    exactDict = {
        'الان': anchorDate,
    }
    nextWords = ["بعد", "دیگه"]
    prevWords = ["پیش", "قبل"]
    ar = _parse_sentence(text)
    mode = 'none'
    number_seen = None
    delta_seen = timedelta(0)
    remainder = []
    result = None
    for x in ar:
        if mode == 'finished':
            # The date is complete: everything else is remainder. Number
            # tokens are tuples, so put their source words back rather
            # than the tuple itself (which " ".join cannot handle).
            if isinstance(x, tuple):
                remainder.extend(x[1])
            else:
                remainder.append(x)
            continue

        handled = 1
        if x == 'و' and mode[:5] == 'delta':
            pass
        elif isinstance(x, tuple):
            number_seen = x
        elif x in weekday_names:
            dayOffset = (weekday_names.index(x) + 1) - today_weekday
            if dayOffset < 0:
                dayOffset += 7
            result = today + timedelta(days=dayOffset)
            mode = 'time'
        elif x in exactDict:
            result = exactDict[x]
            mode = 'finished'
        elif x in daysDict:
            result = daysDict[x]
            mode = 'time'
        elif x in timesDict and mode == 'time':
            result += timesDict[x]
            mode = 'finished'
        elif x in _date_units:
            k = 1
            if number_seen:
                k = number_seen[0]
                number_seen = None
            delta_seen += _date_units[x] * k
            if mode != 'delta_time':
                mode = 'delta_date'
        elif x in _time_units:
            k = 1
            if number_seen:
                k = number_seen[0]
                number_seen = None
            delta_seen += _time_units[x] * k
            mode = 'delta_time'
        elif x in nextWords or x in prevWords:
            # Give up instead of incorrect result
            if mode == 'time':
                return None
            # "later" adds the collected offset, "ago" subtracts it
            sign = 1 if x in nextWords else -1
            if mode == 'delta_date':
                result = today + sign * delta_seen
                mode = 'time'
            elif mode == 'delta_time':
                result = anchorDate + sign * delta_seen
                mode = 'finished'
            else:
                handled = 0
        else:
            handled = 0
        if handled == 1:
            continue
        if number_seen:
            remainder.extend(number_seen[1])
            number_seen = None
        remainder.append(x)
    return (result, " ".join(remainder))


def is_fractional_fa(input_str, short_scale=True):
    """
    This function takes the given text and checks if it is a fraction.

    Recognised words are "نیم" (half), "ربع" (quarter) and the
    denominator words listed in _FRACTION_STRING_FA (1/2 .. 1/20).
    Formal spellings are accepted as well.

    Args:
        input_str (str): the string to check if fractional
        short_scale (bool): accepted for interface compatibility; it has
                            no effect on the result
    Returns:
        (bool) or (float): False if not a fraction, otherwise the fraction

    """
    word = input_str.strip()
    for formal, conversational in _FORMAL_VARIANT.items():
        word = word.replace(formal, conversational)

    if word == "نیم":
        return 0.5
    if word == "ربع":
        return 0.25
    for denominator, name in _FRACTION_STRING_FA.items():
        if name == word:
            return 1.0 / denominator
    return False


def extract_numbers_fa(text, short_scale=True, ordinals=False):
    """
        Takes in a string and extracts a list of numbers.

    Args:
        text (str): the string to extract a number from
        short_scale (bool): accepted for interface compatibility; it has
                            no effect on the result
        ordinals (bool): accepted for interface compatibility; it has no
                         effect on the result
    Returns:
        list: list of extracted numbers, in the order they appear
    """

    return [x[0] for x in _parse_sentence(text) if isinstance(x, tuple)]


def extract_number_fa(text, ordinals=False):
    """
    This function extracts the first number from a text string.

    Args:
        text (str): the string to extract a number from
        ordinals (bool): passed through to extract_numbers_fa
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found

    """
    x = extract_numbers_fa(text, ordinals=ordinals)
    if len(x) == 0:
        return False
    return x[0]


class FarsiNormalizer(Normalizer):
    """Normalizer configured from the Farsi (fa-ir) resource file."""
    # The configuration contains non-ASCII text, so read it as UTF-8
    # regardless of the platform's default encoding.
    with open(resolve_resource_file("text/fa-ir/normalize.json"),
              encoding="utf-8") as f:
        _default_config = json.load(f)


# Backward-compatible alias for the name this class was first given.
EnglishNormalizer = FarsiNormalizer


def normalize_fa(text, remove_articles=True):
    """ Farsi string normalization """
    return FarsiNormalizer().normalize(text, remove_articles)

# --- remainder of this collapsed patch line, preserved verbatim below
# --- (patch metadata and the beginning of a JSON resource):
# diff --git a/lingua_franca/res/text/fa-ir/and.word b/lingua_franca/res/text/fa-ir/and.word new file mode 100644 index 00000000..438fc2d5 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/and.word @@ -0,0 +1 @@ +و \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/date_time.json b/lingua_franca/res/text/fa-ir/date_time.json new file mode 100644 index 00000000..1a43989f --- /dev/null +++ b/lingua_franca/res/text/fa-ir/date_time.json @@ -0,0 +1,180 @@ +{ + "decade_format": { + "1": { + "match": "^\\d$", + "format": "{x}" + }, + "2": { + "match": "^1\\d$", + "format": "{xx}" + }, + "3": { + "match": "^\\d0$", + "format": "{x0}" + }, + "4": { + "match": "^[2-9]\\d$", + "format": "{x0} {x}" + }, + "default": "{number}" + }, + "hundreds_format": { + "1": { + "match": "^\\d{3}$", + "format": "{x_in_x00} hundred" + }, + "default": "{number}" + }, + "thousand_format": { +
"1": { + "match": "^\\d00\\d$", + "format": "{x_in_x000} thousand" + }, + "2": { + "match": "^1\\d00$", + "format": "{xx_in_xx00} hundred" + }, + "3": { + "match": "^\\d{2}00$", + "format": "{x0_in_x000} {x_in_x00} hundred" + }, + "4": { + "match": "^(1\\d{3})|(\\d0\\d{2})$", + "format": "{xx_in_xx00}" + }, + "5": { + "match": "^\\d{4}$", + "format": "{x0_in_x000} {x_in_x00}" + }, + "default": "{number}" + }, + "year_format": { + "1": { + "match": "^\\d\\d?$", + "format": "{formatted_decade} {bc}" + }, + "2": { + "match": "^\\d00$", + "format": "{formatted_hundreds} {bc}" + }, + "3": { + "match": "^\\d{3}$", + "format": "{formatted_hundreds} {formatted_decade} {bc}" + }, + "4": { + "match": "^\\d{2}00$", + "format": "{formatted_thousand} {bc}" + }, + "5": { + "match": "^\\d00\\d$", + "format": "{formatted_thousand} {formatted_decade} {bc}" + }, + "6": { + "match": "^\\d{2}0\\d$", + "format": "{formatted_thousand} {formatted_decade} {bc}" + }, + "7": { + "match": "^\\d{4}$", + "format": "{formatted_thousand} {formatted_decade} {bc}" + }, + "default": "{year} {bc}", + "bc": "بعد از میلاد" + }, + "date_format": { + "date_full": "{weekday}, {day} {month} {formatted_year}", + "date_full_no_year": "{weekday}, {day} {month}", + "date_full_no_year_month": "{weekday}, {day}", + "today": "امروز", + "tomorrow": "فردا", + "yesterday": "دیروز" + }, + "date_time_format": { + "date_time": "{formatted_date} ساعت {formatted_time}" + }, + "weekday": { + "0": "دوشنبه", + "1": "سه شنبه", + "2": "چهارشنبه", + "3": "پنج شنبه", + "4": "جمعه", + "5": "شنبه", + "6": "یکشنبه" + }, + "date": { + "1": "یکم", + "2": "دوم", + "3": "سوم", + "4": "چهارم", + "5": "پنجم", + "6": "ششم", + "7": "هفتم", + "8": "هشتم", + "9": "نهم", + "10": "دهم", + "11": "یازدهم", + "12": "دوازدهم", + "13": "سیزدهم", + "14": "چهاردهم", + "15": "پونزدهم", + "16": "شونزدهم", + "17": "هیفدهم", + "18": "هیجدهم", + "19": "نوزدهم", + "20": "بیستم", + "21": "بیست و یکم", + "22": "بیست و دوم", + "23": "بیست و سوم", + "24": 
"بیست و چهارم", + "25": "بیست و پنجم", + "26": "بیست و ششم", + "27": "بیست و هفتم", + "28": "بیست و هشتم", + "29": "بیست و نهم", + "30": "سیم", + "31": "سی و یکم" + }, + "month": { + "1": "ژانویه", + "2": "فوریه", + "3": "مارس", + "4": "آوریل", + "5": "مه", + "6": "جون", + "7": "جولای", + "8": "آگوست", + "9": "سپتامبر", + "10": "اکتبر", + "11": "نوامبر", + "12": "دسامبر" + }, + "number": { + "0": "صفر", + "1": "یک", + "2": "دو", + "3": "سه", + "4": "چهار", + "5": "پنج", + "6": "شش", + "7": "هفت", + "8": "هشت", + "9": "نه", + "10": "ده", + "11": "یازده", + "12": "دوازده", + "13": "سیزده", + "14": "چهارده", + "15": "پونزده", + "16": "شونزده", + "17": "هیفده", + "18": "هیجده", + "19": "نوزده", + "20": "بیست", + "30": "سی", + "40": "چهل", + "50": "پنجاه", + "60": "شصت", + "70": "هفتاد", + "80": "هشتاد", + "90": "نود" + } +} diff --git a/lingua_franca/res/text/fa-ir/date_time_test.json b/lingua_franca/res/text/fa-ir/date_time_test.json new file mode 100644 index 00000000..72321e35 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/date_time_test.json @@ -0,0 +1,36 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "یک بعد از میلاد" }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ده بعد از میلاد" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ده دوازده" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ده چهل و شش" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هیجده صفر هفت" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هیفده هیفده" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "نوزده هشتاد و هشت"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "دو هزار و نه"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "بیست هیجده"}, + 
"15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "بیست بیست و یک"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "بیست سی"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "دو هزار و صد" }, + "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هزار" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "دو هزار" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "سی و یک بیست بعد از میلاد" }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "سی و دو چهل و یک بعد از میلاد" }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "پنجاه و دو هزار" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "سه شنبه, سی و یکم ژانویه بیست هیفده"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه بیست هیجده"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "یکشنبه, چهارم"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "فردا"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "امروز"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "دیروز"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه بیست هیجده"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", 
"use_ampm": "True", "assertEqual": "سه شنبه, سی و یکم ژانویه بیست هیفده ساعت یک و بیست و دو دقیقه بعد از ظهر"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "سه شنبه, سی و یکم ژانویه بیست هیفده ساعت سیزده و بیست و دو دقیقه"} + } +} diff --git a/lingua_franca/res/text/fa-ir/day.word b/lingua_franca/res/text/fa-ir/day.word new file mode 100644 index 00000000..dfc15b79 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/day.word @@ -0,0 +1 @@ +روز \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/days.word b/lingua_franca/res/text/fa-ir/days.word new file mode 100644 index 00000000..dfc15b79 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/days.word @@ -0,0 +1 @@ +روز \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/hour.word b/lingua_franca/res/text/fa-ir/hour.word new file mode 100644 index 00000000..3f2b7b16 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/hour.word @@ -0,0 +1 @@ +ساعت \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/hours.word b/lingua_franca/res/text/fa-ir/hours.word new file mode 100644 index 00000000..3f2b7b16 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/hours.word @@ -0,0 +1 @@ +ساعت \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/minute.word b/lingua_franca/res/text/fa-ir/minute.word new file mode 100644 index 00000000..1e9a05d8 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/minute.word @@ -0,0 +1 @@ +دقیقه \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/minutes.word b/lingua_franca/res/text/fa-ir/minutes.word new file mode 100644 index 00000000..1e9a05d8 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/minutes.word @@ -0,0 +1 @@ +دقیقه \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/normalize.json b/lingua_franca/res/text/fa-ir/normalize.json new file mode 100644 index 00000000..4126c02e --- /dev/null +++ 
b/lingua_franca/res/text/fa-ir/normalize.json @@ -0,0 +1,141 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": false, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": { + "I'd": "I would", + "I'll": "I will", + "I'm": "I am", + "I've": "I have", + "ain't": "is not", + "aren't": "are not", + "can't": "can not", + "could've": "could have", + "couldn't": "could not", + "didn't": "did not", + "doesn't": "does not", + "don't": "do not", + "gonna": "going to", + "gotta": "got to", + "hadn't": "had not", + "hasn't": "has not", + "haven't": "have not", + "he'd": "he would", + "he'll": "he will", + "he's": "he is", + "how'd": "how did", + "how'll": "how will", + "how's": "how is", + "isn't": "is not", + "it'd": "it would", + "it'll": "it will", + "it's": "it is", + "might've": "might have", + "mightn't": "might not", + "must've": "must have", + "mustn't": "must not", + "needn't": "need not", + "oughtn't": "ought not", + "shan't": "shall not", + "she'd": "she would", + "she'll": "she will", + "she's": "she is", + "should've": "should have", + "shouldn't": "should not", + "somebody's": "somebody is", + "someone'd": "someone would", + "someone'll": "someone will", + "someone's": "someone is", + "that'd": "that would", + "that'll": "that will", + "that's": "that is", + "there'd": "there would", + "there're": "there are", + "there's": "there is", + "they'd": "they would", + "they'll": "they will", + "they're": "they are", + "they've": "they have", + "wasn't": "was not", + "we'd": "we would", + "we'll": "we will", + "we're": "we are", + "we've": "we have", + "weren't": "were not", + "what'd": "what did", + "what'll": "what will", + "what're": "what are", + "what's": "what is", + "what've": "what have", + "whats": "what is", + "when'd": "when did", + "when's": "when is", + "where'd": "where did", + "where's": "where is", + "where've": "where have", + "who'd": "who would", 
+ "who'd've": "who would have", + "who'll": "who will", + "who're": "who are", + "who's": "who is", + "who've": "who have", + "why'd": "why did", + "why're": "why are", + "why's": "why is", + "won't": "will not", + "won't've": "will not have", + "would've": "would have", + "wouldn't": "would not", + "wouldn't've": "would not have", + "y'ain't": "you are not", + "y'aint": "you are not", + "y'all": "you all", + "ya'll": "you all", + "you'd": "you would", + "you'd've": "you would have", + "you'll": "you will", + "you're": "you are", + "you've": "you have" + }, + "word_replacements": {}, + "number_replacements": { + "zero": "0", + "one": "1", + "two": "2", + "three": "3", + "four": "4", + "five": "5", + "six": "6", + "seven": "7", + "eight": "8", + "nine": "9", + "ten": "10", + "eleven": "11", + "twelve": "12", + "thirteen": "13", + "fourteen": "14", + "fifteen": "15", + "sixteen": "16", + "seventeen": "17", + "eighteen": "18", + "nineteen": "19", + "twenty": "20", + "thirty": "30", + "forty": "40", + "fifty": "50", + "sixty": "60", + "seventy": "70", + "eighty": "80", + "ninety": "90" + }, + "stopwords": [], + "articles": [ + "the", + "a", + "an" + ] +} \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/or.word b/lingua_franca/res/text/fa-ir/or.word new file mode 100644 index 00000000..aa43ee0c --- /dev/null +++ b/lingua_franca/res/text/fa-ir/or.word @@ -0,0 +1 @@ +یا \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/second.word b/lingua_franca/res/text/fa-ir/second.word new file mode 100644 index 00000000..3d2bee65 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/second.word @@ -0,0 +1 @@ +ثانیه \ No newline at end of file diff --git a/lingua_franca/res/text/fa-ir/seconds.word b/lingua_franca/res/text/fa-ir/seconds.word new file mode 100644 index 00000000..3d2bee65 --- /dev/null +++ b/lingua_franca/res/text/fa-ir/seconds.word @@ -0,0 +1 @@ +ثانیه \ No newline at end of file diff --git a/test/test_format_fa.py 
b/test/test_format_fa.py new file mode 100644 index 00000000..8bc01119 --- /dev/null +++ b/test/test_format_fa.py @@ -0,0 +1,394 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import json +import unittest +import datetime +import ast +import warnings +import sys +from pathlib import Path + +# TODO either write a getter for lingua_franca.internal._SUPPORTED_LANGUAGES, +# or make it public somehow +from lingua_franca import load_languages, unload_languages, set_default_lang, \ + get_primary_lang_code, get_active_langs, get_supported_langs +from lingua_franca.internal import UnsupportedLanguageError +from lingua_franca.format import nice_number +from lingua_franca.format import nice_time +from lingua_franca.format import nice_date +from lingua_franca.format import nice_date_time +from lingua_franca.format import nice_year +from lingua_franca.format import nice_duration +from lingua_franca.format import pronounce_number +from lingua_franca.format import date_time_format +from lingua_franca.format import join_list + + +def setUpModule(): + load_languages(get_supported_langs()) + # TODO spin English tests off into another file, like other languages, so we + # don't have to do this confusing thing in the "master" test_format.py + set_default_lang('fa-ir') + + +def tearDownModule(): + unload_languages(get_active_langs()) + + +NUMBERS_FIXTURE_EN = { + 1.435634: '1.436', + 2: '2', + 5.0: '5', + 0.027: '0.027', + 0.5: 'یک دوم', + 
1.333: '1 و یک سوم', + 2.666: '2 و 2 سوم', + 0.25: 'یک چهارم', + 1.25: '1 و یک چهارم', + 0.75: '3 چهارم', + 1.75: '1 و 3 چهارم', + 3.4: '3 و 2 پنجم', + 16.8333: '16 و 5 ششم', + 12.5714: '12 و 4 هفتم', + 9.625: '9 و 5 هشتم', + 6.777: '6 و 7 نهم', + 3.1: '3 و یک دهم', + 2.272: '2 و 3 یازدهم', + 5.583: '5 و 7 دوازدهم', + 8.384: '8 و 5 سیزدهم', + 0.071: 'یک چهاردهم', + 6.466: '6 و 7 پونزدهم', + 8.312: '8 و 5 شونزدهم', + 2.176: '2 و 3 هیفدهم', + 200.722: '200 و 13 هیجدهم', + 7.421: '7 و 8 نوزدهم', + 0.05: 'یک بیستم' +} + + +class TestNiceNumberFormat(unittest.TestCase): + + tmp_var = None + + def set_tmp_var(self, val): + self.tmp_var = val + + def test_convert_float_to_nice_number(self): + for number, number_str in NUMBERS_FIXTURE_EN.items(): + self.assertEqual(nice_number(number), number_str, + 'should format {} as {} and not {}'.format( + number, number_str, nice_number(number))) + + def test_specify_denominator(self): + self.assertEqual(nice_number(5.5, denominators=[1, 2, 3]), + '5 و یک دوم', + 'should format 5.5 as 5 and a half not {}'.format( + nice_number(5.5, denominators=[1, 2, 3]))) + self.assertEqual(nice_number(2.333, denominators=[1, 2]), + '2.333', + 'should format 2.333 as 2.333 not {}'.format( + nice_number(2.333, denominators=[1, 2]))) + + def test_no_speech(self): + self.assertEqual(nice_number(6.777, speech=False), + '6 7/9', + 'should format 6.777 as 6 7/9 not {}'.format( + nice_number(6.777, speech=False))) + self.assertEqual(nice_number(6.0, speech=False), + '6', + 'should format 6.0 as 6 not {}'.format( + nice_number(6.0, speech=False))) + + +class TestPronounceNumber(unittest.TestCase): + def test_convert_int(self): + self.assertEqual(pronounce_number(0), "صفر") + self.assertEqual(pronounce_number(1), "یک") + self.assertEqual(pronounce_number(10), "ده") + self.assertEqual(pronounce_number(15), "پونزده") + self.assertEqual(pronounce_number(20), "بیست") + self.assertEqual(pronounce_number(27), "بیست و هفت") + self.assertEqual(pronounce_number(30), 
"سی") + self.assertEqual(pronounce_number(33), "سی و سه") + + def test_convert_negative_int(self): + self.assertEqual(pronounce_number(-1), "منفی یک") + self.assertEqual(pronounce_number(-10), "منفی ده") + self.assertEqual(pronounce_number(-15), "منفی پونزده") + self.assertEqual(pronounce_number(-20), "منفی بیست") + self.assertEqual(pronounce_number(-27), "منفی بیست و هفت") + + def test_convert_decimals(self): + self.assertEqual(pronounce_number(0.05), "پنج صدم") + self.assertEqual(pronounce_number(-0.05), "منفی پنج صدم") + self.assertEqual(pronounce_number(1.234), + "یک و بیست و سه صدم") + self.assertEqual(pronounce_number(21.234), + "بیست و یک و بیست و سه صدم") + self.assertEqual(pronounce_number(21.234, places=1), + "بیست و یک و دو دهم") + self.assertEqual(pronounce_number(21.234, places=0), + "بیست و یک") + self.assertEqual(pronounce_number(21.234, places=3), + "بیست و یک و دویست و سی و چهار هزارم") + self.assertEqual(pronounce_number(21.234, places=4), + "بیست و یک و دویست و سی و چهار هزارم") + self.assertEqual(pronounce_number(21.234, places=5), + "بیست و یک و دویست و سی و چهار هزارم") + self.assertEqual(pronounce_number(-1.234), + "منفی یک و بیست و سه صدم") + self.assertEqual(pronounce_number(-21.234), + "منفی بیست و یک و بیست و سه صدم") + self.assertEqual(pronounce_number(-21.234, places=1), + "منفی بیست و یک و دو دهم") + + def test_convert_hundreds(self): + self.assertEqual(pronounce_number(100), "صد") + self.assertEqual(pronounce_number(666), "ششصد و شصت و شش") + self.assertEqual(pronounce_number(1456), "هزار و چهارصد و پنجاه و شش") + self.assertEqual(pronounce_number(103254654), "صد و سه میلیون و " + "دویست و پنجاه و چهار " + "هزار و ششصد و پنجاه و چهار") + self.assertEqual(pronounce_number(1512457), "یک میلیون و پانصد و دوازده هزار" + " و چهارصد و پنجاه و هفت") + self.assertEqual(pronounce_number(209996), "دویست و نه هزار و نهصد و نود و شش") + + def test_convert_scientific_notation(self): + self.assertEqual(pronounce_number(0, scientific=True), "صفر") + 
self.assertEqual(pronounce_number(33, scientific=True), + "سه و سه دهم ضرب در ده به توان یک") + self.assertEqual(pronounce_number(299792458, scientific=True), + "دو و نود و نه صدم ضرب در ده به توان هشت") + self.assertEqual(pronounce_number(299792448, places=6, + scientific=True), + "دو و نهصد و نود و هفت هزار و نهصد و بیست و چهار میلیونیم ضرب در ده به توان هشت") + self.assertEqual(pronounce_number(1.672e-27, places=3, + scientific=True), + "یک و ششصد و هفتاد و دو هزارم ضرب در ده به توان منفی بیست و هفت") + + def test_ordinals(self): + self.assertEqual(pronounce_number(1, ordinals=True), "یکم") + self.assertEqual(pronounce_number(10, ordinals=True), "دهم") + self.assertEqual(pronounce_number(15, ordinals=True), "پونزدهم") + self.assertEqual(pronounce_number(20, ordinals=True), "بیستم") + self.assertEqual(pronounce_number(27, ordinals=True), "بیست و هفتم") + self.assertEqual(pronounce_number(30, ordinals=True), "سیم") + self.assertEqual(pronounce_number(33, ordinals=True), "سی و سوم") + self.assertEqual(pronounce_number(100, ordinals=True), "صدم") + self.assertEqual(pronounce_number(1000, ordinals=True), "هزارم") + self.assertEqual(pronounce_number(10000, ordinals=True), + "ده هزارم") + self.assertEqual(pronounce_number(18691, ordinals=True), + "هیجده هزار و ششصد و نود و یکم") + self.assertEqual(pronounce_number(1567, ordinals=True), + "هزار و پانصد و شصت و هفتم") + self.assertEqual(pronounce_number(18e6, ordinals=True), + "هیجده میلیونم") + self.assertEqual(pronounce_number(18e9, ordinals=True), + "هیجده میلیاردم") + def test_variant(self): + self.assertEqual(pronounce_number(18691, ordinals=True, variant="formal"), + "هجده هزار و ششصد و نود و یکم") + self.assertEqual(pronounce_number(15, variant='conversational'), "پونزده") + self.assertEqual(pronounce_number(15, variant='formal'), "پانزده") + self.assertEqual(nice_number(2.176, variant='formal'), "2 و 3 هفدهم") + dt = datetime.datetime(2017, 1, 31, + 16, 22, 3) + self.assertEqual(nice_time(dt, use_24hour=True, 
use_ampm=True, variant='formal'), + "شانزده و بیست و دو دقیقه") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True, variant='conversational'), + "شونزده و بیست و دو دقیقه") + + + +# def nice_time(dt, lang="en-us", speech=True, use_24hour=False, +# use_ampm=False): + +class TestNiceDateFormat(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Read date_time_test.json files for test data + cls.test_config = {} + p = Path(date_time_format.config_path) + for sub_dir in [x for x in p.iterdir() if x.is_dir()]: + if (sub_dir / 'date_time_test.json').exists(): + print("Getting test for " + + str(sub_dir / 'date_time_test.json')) + with (sub_dir / 'date_time_test.json').open() as f: + cls.test_config[sub_dir.parts[-1]] = json.loads(f.read()) + + + def test_convert_times(self): + dt = datetime.datetime(2017, 1, 31, + 13, 22, 3) + + # Verify defaults haven't changed + self.assertEqual(nice_time(dt), + nice_time(dt, "fa-ir", True, False, False)) + + self.assertEqual(nice_time(dt), + "یک و بیست و دو دقیقه") + self.assertEqual(nice_time(dt, use_ampm=True), + "یک و بیست و دو دقیقه بعد از ظهر") + self.assertEqual(nice_time(dt, speech=False), + "1:22") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "1:22 PM") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:22") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "13:22") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "سیزده و بیست و دو دقیقه") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "سیزده و بیست و دو دقیقه") + + dt = datetime.datetime(2017, 1, 31, + 13, 0, 3) + self.assertEqual(nice_time(dt), + "یک") + self.assertEqual(nice_time(dt, use_ampm=True), + "یک بعد از ظهر") + self.assertEqual(nice_time(dt, speech=False), + "1:00") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "1:00 PM") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:00") + 
self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "13:00") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "سیزده") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "سیزده") + + dt = datetime.datetime(2017, 1, 31, + 13, 2, 3) + self.assertEqual(nice_time(dt), + "یک و دو دقیقه") + self.assertEqual(nice_time(dt, use_ampm=True), + "یک و دو دقیقه بعد از ظهر") + self.assertEqual(nice_time(dt, speech=False), + "1:02") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "1:02 PM") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "13:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "سیزده و دو دقیقه") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "سیزده و دو دقیقه") + + dt = datetime.datetime(2017, 1, 31, + 0, 2, 3) + self.assertEqual(nice_time(dt), + "دوازده و دو دقیقه") + self.assertEqual(nice_time(dt, use_ampm=True), + "دوازده و دو دقیقه قبل از ظهر") + self.assertEqual(nice_time(dt, speech=False), + "12:02") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "12:02 AM") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "00:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "00:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "صفر و دو دقیقه") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "صفر و دو دقیقه") + + dt = datetime.datetime(2018, 2, 8, + 1, 2, 33) + self.assertEqual(nice_time(dt), + "یک و دو دقیقه") + self.assertEqual(nice_time(dt, use_ampm=True), + "یک و دو دقیقه قبل از ظهر") + self.assertEqual(nice_time(dt, speech=False), + "1:02") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "1:02 AM") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "01:02") + 
self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "01:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "یک و دو دقیقه") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "یک و دو دقیقه") + + dt = datetime.datetime(2017, 1, 31, + 12, 15, 9) + self.assertEqual(nice_time(dt), + "دوازده و ربع") + self.assertEqual(nice_time(dt, use_ampm=True), + "دوازده و ربع بعد از ظهر") + + dt = datetime.datetime(2017, 1, 31, + 5, 30, 00) + self.assertEqual(nice_time(dt, use_ampm=True), + "پنج و نیم قبل از ظهر") + + dt = datetime.datetime(2017, 1, 31, + 1, 45, 00) + self.assertEqual(nice_time(dt), + "یه ربع به دو") + + # TODO: failed because of و + #def test_nice_duration(self): + # self.assertEqual(nice_duration(1), "یک ثانیه") + # self.assertEqual(nice_duration(3), "سه ثانیه") + # self.assertEqual(nice_duration(1, speech=False), "0:01") + # self.assertEqual(nice_duration(61), "یک دقیقه و یک ثانیه") + # self.assertEqual(nice_duration(61, speech=False), "1:01") + # self.assertEqual(nice_duration(5000), + # "یک ساعت و بیست و سه دقیقه و بیست ثانیه") + # self.assertEqual(nice_duration(5000, speech=False), "1:23:20") + # self.assertEqual(nice_duration(50000), + # "سیزده ساعت و پنجاه و سه دقیقه و بیست ثانیه") + # self.assertEqual(nice_duration(50000, speech=False), "13:53:20") + # self.assertEqual(nice_duration(500000), + # "پنج روز و هیجده ساعت و پنجاه و سه دقیقه و بیست ثانیه") # nopep8 + # self.assertEqual(nice_duration(500000, speech=False), "5d 18:53:20") + # self.assertEqual(nice_duration(datetime.timedelta(seconds=500000), + # speech=False), + # "5d 18:53:20") + + def test_join(self): + self.assertEqual(join_list(None, "and"), "") + self.assertEqual(join_list([], "and"), "") + + self.assertEqual(join_list(["الف"], "و"), "الف") + self.assertEqual(join_list(["الف", "ب"], "و"), "الف و ب") + self.assertEqual(join_list(["الف", "ب"], "یا"), "الف یا ب") + + self.assertEqual(join_list(["الف", "ب", "ج"], "و"), "الف, ب 
و ج") + self.assertEqual(join_list(["الف", "ب", "ج"], "یا"), "الف, ب یا ج") + self.assertEqual(join_list(["الف", "ب", "ج"], "یا", ";"), "الف; ب یا ج") + self.assertEqual(join_list(["الف", "ب", "ج", "دال"], "یا"), "الف, ب, ج یا دال") + + self.assertEqual(join_list([1, "ب", 3, "دال"], "یا"), "1, ب, 3 یا دال") + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_parse_fa.py b/test/test_parse_fa.py new file mode 100644 index 00000000..b87909b9 --- /dev/null +++ b/test/test_parse_fa.py @@ -0,0 +1,170 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import unittest +from datetime import datetime, timedelta + +from lingua_franca import load_language, unload_language, set_default_lang +from lingua_franca.internal import FunctionNotLocalizedError +from lingua_franca.parse import extract_datetime +from lingua_franca.parse import extract_duration +from lingua_franca.parse import extract_number, extract_numbers +from lingua_franca.parse import fuzzy_match +from lingua_franca.parse import get_gender +from lingua_franca.parse import match_one +from lingua_franca.parse import normalize + + +def setUpModule(): + # TODO spin off English tests + load_language('fa') + set_default_lang('fa') + + +def tearDownModule(): + unload_language('fa') + +class TestNormalize(unittest.TestCase): + + def test_extract_number(self): + #self.assertEqual(extract_number("این تست اول است", + # ordinals=True), 1) + self.assertEqual(extract_number("این تست دو است"), 2) + #self.assertEqual(extract_number("این تست دوم است", + # ordinals=True), 2) + #self.assertEqual(extract_number("این تست سوم است", + # ordinals=True), 3.0) + #self.assertEqual(extract_number("چهارمی", ordinals=True), 4.0) + #self.assertEqual(extract_number("سی و ششمی", ordinals=True), 36.0) + self.assertEqual(extract_number("این تست شماره چهار است"), 4) + #self.assertEqual(extract_number("یک سوم فنجان"), 1.0 / 3.0) + self.assertEqual(extract_number("سه فنجان"), 3) + #self.assertEqual(extract_number("۱/۳ فنجان"), 1.0 / 3.0) + #self.assertEqual(extract_number("یک چهارم فنجان"), 0.25) + #self.assertEqual(extract_number("۱/۴ فنجان"), 0.25) + #self.assertEqual(extract_number("دو سوم فنجان"), 2.0 / 3.0) + #self.assertEqual(extract_number("سه چهارم فنجان"), 3.0 / 4.0) + #self.assertEqual(extract_number("یک و سه چهارم فنجان"), 1.75) + #self.assertEqual(extract_number("۱ فنجان و نیم"), 1.5) + #self.assertEqual(extract_number("یک فنجان و نیم"), 1.5) + self.assertEqual(extract_number("یک و نیم فنجان"), 1.5) + self.assertEqual(extract_number("بیست و دو"), 22) + 
#self.assertEqual(extract_number("بیست و دو و سه پنجم"), 22.6) + self.assertEqual(extract_number("دویست"), 200) + self.assertEqual(extract_number("نه هزار"), 9000) + self.assertEqual(extract_number("هزار و پانصد"), 1500) + self.assertEqual(extract_number("ششصد و شصت و شش"), 666) + self.assertEqual(extract_number("دو میلیون"), 2000000) + self.assertEqual(extract_number("دو هزار و هفده"), 2017) + self.assertEqual(extract_number("شانزده هزار و صد و پونزده"), 16115) + self.assertEqual(extract_number("هجده میلیون و هجده هزار و دویست و هجده"), 18018218) + self.assertEqual(extract_number("دو میلیون و پانصد هزار " + "تن گوشت یخ زده"), 2500000) + + def test_extract_duration_en(self): + self.assertEqual(extract_duration("10 ثانیه"), + (timedelta(seconds=10.0), "")) + self.assertEqual(extract_duration("5 دقیقه"), + (timedelta(minutes=5), "")) + self.assertEqual(extract_duration("2 ساعت"), + (timedelta(hours=2), "")) + self.assertEqual(extract_duration("3 روز"), + (timedelta(days=3), "")) + self.assertEqual(extract_duration("25 هفته"), + (timedelta(weeks=25), "")) + self.assertEqual(extract_duration("هفت ساعت"), + (timedelta(hours=7), "")) + self.assertEqual(extract_duration("7.5 ثانیه"), + (timedelta(seconds=7.5), "")) + self.assertEqual(extract_duration("هشت و نیم روز و " + "سی و نه ثانیه"), + (timedelta(days=8.5, seconds=39), "")) + self.assertEqual(extract_duration("یک تایمر برای نیم ساعت دیگه بزار"), + (timedelta(minutes=30), "یک تایمر برای دیگه بزار")) + self.assertEqual(extract_duration("چهار و نیم دقیقه تا " + "طلوع آفتاب"), + (timedelta(minutes=4.5), "تا طلوع آفتاب")) + self.assertEqual(extract_duration("این فیلم یک ساعت و پنجاه و هفت و نیم دقیقه " + "طول می کشد"), + (timedelta(hours=1, minutes=57.5), + "این فیلم طول می کشد")) + def test_extractdatetime_en(self): + def extractWithFormat(text): + date = datetime(2017, 6, 27, 13, 4) # Tue June 27, 2017 @ 1:04pm + [extractedDate, leftover] = extract_datetime(text, date) + extractedDate = extractedDate.strftime("%Y-%m-%d 
%H:%M:%S") + return [extractedDate, leftover] + + def testExtract(text, expected_date, expected_leftover): + res = extractWithFormat(normalize(text)) + self.assertEqual(res[0], expected_date, "for=" + text) + self.assertEqual(res[1], expected_leftover, "for=" + text) + + testExtract("الان ساعت اینه", + "2017-06-27 13:04:00", "ساعت اینه") + testExtract("یک ثانیه دیگه", + "2017-06-27 13:04:01", "") + testExtract("یک دقیقه دیگه", + "2017-06-27 13:05:00", "") + testExtract("دو دقیقه دیگه", + "2017-06-27 13:06:00", "") + testExtract("دو ساعت دیگه", + "2017-06-27 15:04:00", "") + testExtract("من یک ساعت دیگه می خوامش", + "2017-06-27 14:04:00", "من می خوامش") + testExtract("1 ثانیه دیگه", + "2017-06-27 13:04:01", "") + testExtract("2 ثانیه دیگه", + "2017-06-27 13:04:02", "") + testExtract("یک آلارم برای یک دقیقه بعد بزار", + "2017-06-27 13:05:00", "یک آلارم برای بزار") + testExtract("یک آلارم برای نیم ساعت دیگه بزار", + "2017-06-27 13:34:00", "یک آلارم برای بزار") + testExtract("یه آلارم برای پنج روز بعد بزار", + "2017-07-02 00:00:00", "یه آلارم برای بزار") + testExtract("پس فردا", + "2017-06-29 00:00:00", "") + testExtract("آب و هوا پس فردا چطوره؟", + "2017-06-29 00:00:00", "آب و هوا چطوره؟") + #testExtract("ساعت بیست و دو و چهل و پنج دقیقه بهم یادآوری کن", + # "2017-06-27 22:45:00", "بهم یادآوری کن") + testExtract("هوای جمعه صبح چطوره؟", + "2017-06-30 08:00:00", "هوای چطوره؟") + testExtract("هوای فردا چطوره؟", + "2017-06-28 00:00:00", "هوای چطوره؟") + testExtract("هوای امروز بعد از ظهر چطوره؟", + "2017-06-27 15:00:00", "هوای چطوره؟") + testExtract("یادم بنداز که هشت هفته و دو روز دیگه به مادرم زنگ بزنم", + "2017-08-24 00:00:00", "یادم بنداز که به مادرم زنگ بزنم") + #testExtract("یادم بنداز که دوازده مرداد به مادرم زنگ بزنم", + # "2017-08-03 00:00:00", "یادم بنداز که به مادرم زنگ بزنم") + #testExtract("یادم بنداز که ساعت هفت به مادرم زنگ بزنم", + # "2017-06-28 07:00:00", "یادم بنداز که به مادرم زنگ بزنم") + #testExtract("یادم بنداز که فردا ساعت بیست و دو به مادرم زنگ 
بزنم", + # "2017-06-28 22:00:00", "یادم بنداز که به مادرم زنگ بزنم") + # TODO: This test is imperfect due to the "at 7:00" still in the + # remainder. But let it pass for now since time is correct + + def test_multiple_numbers(self): + self.assertEqual(extract_numbers("یک دو سه"), + [1.0, 2.0, 3.0]) + self.assertEqual(extract_numbers("ده بیست سه پونزده هزار و شصت و شونزده"), + [10, 20, 3, 15060, 16]) + + + + +if __name__ == "__main__": + unittest.main() From eca985cbbe703ad80c2c15cd58ade5c7b1bd7e32 Mon Sep 17 00:00:00 2001 From: ChanceNCounter Date: Tue, 27 Apr 2021 19:26:49 -0700 Subject: [PATCH 2/2] address review, clean up, fix missing loc code --- lingua_franca/internal.py | 1 + lingua_franca/lang/format_fa.py | 17 ++++++----------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/lingua_franca/internal.py b/lingua_franca/internal.py index 7b996c2f..cf7af73e 100644 --- a/lingua_franca/internal.py +++ b/lingua_franca/internal.py @@ -22,6 +22,7 @@ 'de': 'de-de', 'en': 'en-us', 'es': 'es-es', + 'fa': 'fa-ir', 'fr': 'fr-fr', 'hu': 'hu-hu', 'it': 'it-it', diff --git a/lingua_franca/lang/format_fa.py b/lingua_franca/lang/format_fa.py index 10f944a7..04a42757 100644 --- a/lingua_franca/lang/format_fa.py +++ b/lingua_franca/lang/format_fa.py @@ -34,17 +34,13 @@ class NumberVariantFA(IntEnum): "formal": NumberVariantFA.FORMAL, }) -def _applyVariant(text, variant): +def _apply_number_variant(text, variant): if variant == NumberVariantFA.FORMAL: - print("Doing") - print(text) for key, value in _FORMAL_VARIANT.items(): - print("xxx "+value+" "+key) text = text.replace(value, key) - print(text) return text -def _handleVariant(func): +def _handle_number_variant(func): @wraps(func) @lookup_variant({ @@ -54,14 +50,13 @@ def _handleVariant(func): }) def wrapper(*args, **kwargs): result = func(*args, **kwargs) - print(kwargs, result) if 'variant' in kwargs: - return _applyVariant(result, kwargs['variant']) + return _apply_number_variant(result, kwargs['variant']) 
else: return result return wrapper -@_handleVariant +@_handle_number_variant def nice_number_fa(number, speech=True, denominators=range(1, 21), variant=None): """ Farsi helper for nice_number @@ -188,7 +183,7 @@ def _to_cardinal(number, places): return _fractional(y, l) return _cardinalPos(x) + _FARSI_SEPERATOR + _fractional(y, l) -@_handleVariant +@_handle_number_variant def pronounce_number_fa(number, places=2, scientific=False, ordinals=False, variant=None): """ @@ -227,7 +222,7 @@ def pronounce_number_fa(number, places=2, scientific=False, return _to_ordinal(number) return _to_cardinal(number, places) -@_handleVariant +@_handle_number_variant def nice_time_fa(dt, speech=True, use_24hour=False, use_ampm=False, variant=None): """ Format a time to a comfortable human format