diff --git a/lingua_franca/lang/common_data_de.py b/lingua_franca/lang/common_data_de.py index abb9becb..ab46c576 100644 --- a/lingua_franca/lang/common_data_de.py +++ b/lingua_franca/lang/common_data_de.py @@ -1,68 +1,12 @@ -_DE_NUMBERS = { - 'null': 0, - 'ein': 1, - 'eins': 1, - 'eine': 1, - 'einer': 1, - 'einem': 1, - 'einen': 1, - 'eines': 1, - 'zwei': 2, - 'drei': 3, - 'vier': 4, - 'fünf': 5, - 'sechs': 6, - 'sieben': 7, - 'acht': 8, - 'neun': 9, - 'zehn': 10, - 'elf': 11, - 'zwölf': 12, - 'dreizehn': 13, - 'vierzehn': 14, - 'fünfzehn': 15, - 'sechzehn': 16, - 'siebzehn': 17, - 'achtzehn': 18, - 'neunzehn': 19, - 'zwanzig': 20, - 'einundzwanzig': 21, - 'zweiundzwanzig': 22, - 'dreiundzwanzig': 23, - 'vierundzwanzig': 24, - 'fünfundzwanzig': 25, - 'sechsundzwanzig': 26, - 'siebenundzwanzig': 27, - 'achtundzwanzig': 28, - 'neunundzwanzig': 29, - 'dreißig': 30, - 'einunddreißig': 31, - 'vierzig': 40, - 'fünfzig': 50, - 'sechzig': 60, - 'siebzig': 70, - 'achtzig': 80, - 'neunzig': 90, - 'hundert': 100, - 'zweihundert': 200, - 'dreihundert': 300, - 'vierhundert': 400, - 'fünfhundert': 500, - 'sechshundert': 600, - 'siebenhundert': 700, - 'achthundert': 800, - 'neunhundert': 900, - 'tausend': 1000, - 'million': 1000000 -} +from collections import OrderedDict +from lingua_franca.lang.parse_common import invert_dict -_MONTHS_DE = ['januar', 'februar', 'märz', 'april', 'mai', 'juni', - 'juli', 'august', 'september', 'oktober', 'november', - 'dezember'] +_ARTICLES = {'der', 'das', 'die', 'dem', 'den'} -_NUM_STRING_DE = { +#_SPOKEN_NUMBER +_NUM_STRING = { 0: 'null', - 1: 'ein', # ein Viertel etc., nicht eins Viertel + 1: 'eins', 2: 'zwei', 3: 'drei', 4: 'vier', @@ -89,20 +33,44 @@ 70: 'siebzig', 80: 'achtzig', 90: 'neunzig', - 100: 'hundert' + 100: 'hundert', + 200: 'zweihundert', + 300: 'dreihundert', + 400: 'vierhundert', + 500: 'fünfhundert', + 600: 'sechshundert', + 700: 'siebenhundert', + 800: 'achthundert', + 900: 'neunhundert', + 1000: 'tausend', + 1000000: 'million' } +_STRING_NUM = invert_dict(_NUM_STRING) +_STRING_NUM.update({ + 'ein': 1, + 'eine': 1, + 'einer': 1, + 'eines': 1, + 'einem': 1, + 'einen': 1 +}) + +_MONTHS = ['januar', 'februar', 'märz', 'april', 'mai', 'juni', + 'juli', 'august', 'september', 'oktober', 'november', + 'dezember'] + # German uses "long scale" https://en.wikipedia.org/wiki/Long_and_short_scales # Currently, numbers are limited to 1000000000000000000000000, # but _NUM_POWERS_OF_TEN can be extended to include additional number words -_NUM_POWERS_OF_TEN_DE = [ +_NUM_POWERS_OF_TEN = [ '', 'tausend', 'Million', 'Milliarde', 'Billion', 'Billiarde', 'Trillion', 'Trilliarde' ] -_FRACTION_STRING_DE = { +_FRACTION_STRING = { 2: 'halb', 3: 'drittel', 4: 'viertel', @@ -124,6 +92,16 @@ 20: 'zwanzigstel' } +_STRING_FRACTION = invert_dict(_FRACTION_STRING) +_STRING_FRACTION.update({ + 'halb': 2, + 'halbe': 2, + 'halben': 2, + 'halbes': 2, + 'halber': 2, + 'halbem': 2 +}) + # Numbers below 1 million are written in one word in German, yielding very # long words # In some circumstances it may better to seperate individual words @@ -132,4 +110,121 @@ # Set _EXTRA_SPACE_DA="" for correct spelling, this is standard # _EXTRA_SPACE_DA = " " -_EXTRA_SPACE_DE = "" +_EXTRA_SPACE = "" + +_ORDINAL_BASE = { + "1.": "erst", + "2.": "zweit", + "3.": "dritt", + "4.": "viert", + "5.": "fünft", + "6.": "sechst", + "7.": "siebt", + "8.": "acht", + "9.": "neunt", + "10.": "zehnt", + "11.": "elft", + "12.": "zwölft", + "13.": "dreizehnt", + "14.": "vierzehnt", + "15.": "fünfzehnt", + "16.": "sechzehnt", + "17.": "siebzehnt", + "18.": "achtzehnt", + "19.": "neunzehnt", + "20.": "zwanzigst", + "21.": "einundzwanzigst", + "22.": "zweiundzwanzigst", + "23.": "dreiundzwanzigst", + "24.": "vierundzwanzigst", + "25.": "fünfundzwanzigst", + "26.": "sechsundzwanzigst", + "27.": "siebenundzwanzigst", + "28.": "achtundzwanzigst", + "29.": "neunundzwanzigst", + "30.": "dreißigst", + "31.": "einunddreißigst", + "32.": "zweiunddreißigst", + "33.": "dreiunddreißigst", + "34.": "vierunddreißigst", + "35.": "fünfunddreißigst", + "36.": "sechsunddreißigst", + "37.": "siebenunddreißigst", + "38.": "achtunddreißigst", + "39.": "neununddreißigst", + "40.": "vierzigst", + "41.": "einundvierzigst", + "42.": "zweiundvierzigst", + "43.": "dreiundvierzigst", + "44.": "vierundvierzigst", + "45.": "fünfundvierzigst", + "46.": "sechsundvierzigst", + "47.": "siebenundvierzigst", + "48.": "achtundvierzigst", + "49.": "neunundvierzigst", + "50.": "fünfzigst", + "51.": "einundfünfzigst", + "52.": "zweiundfünfzigst", + "53.": "dreiundfünfzigst", + "60.": "sechzigst", + "70.": "siebzigst", + "80.": "achtzigst", + "90.": "neunzigst", + "100.": "einhundertst", + "1000.": "eintausendst", + "1000000.": "millionst" + } + +_LONG_SCALE = OrderedDict([ + (100, 'hundert'), + (1000, 'tausend'), + (1000000, 'million'), + (1e9, "milliarde"), + (1e12, 'billion'), + (1e15, "billiarde"), + (1e18, "trillion"), + (1e21, "trilliarde"), + (1e24, "quadrillion"), + (1e27, "quadrilliarde") +]) + +_MULTIPLIER = set(_LONG_SCALE.values()) + +_STRING_LONG_SCALE = invert_dict(_LONG_SCALE) + +# ending manipulation +for number, item in _LONG_SCALE.items(): + if int(number) > 1000: + if item.endswith('e'): + name = item + 'n' + _MULTIPLIER.add(name) + _STRING_LONG_SCALE[name] = number + else: + name = item + 'en' + _MULTIPLIER.add(name) + _STRING_LONG_SCALE[name] = number + +_LONG_ORDINAL = { + 1e6: "millionst", + 1e9: "milliardst", + 1e12: "billionst", + 1e15: "billiardst", + 1e18: "trillionst", + 1e21: "trilliardst", + 1e24: "quadrillionst", + 1e27: "quadrilliardst" +} + +_LONG_ORDINAL.update(_ORDINAL_BASE) + +# dict für erste, drittem, millionstes ... +_STRING_LONG_ORDINAL = {ord+ending: num for ord, num in invert_dict(_LONG_ORDINAL).items() + for ending in ("en", "em", "es", "er", "e")} + +_FRACTION_MARKER = set() + +_NEGATIVES = {"minus"} + +_NUMBER_CONNECTORS = {"und"} + +_COMMA = {"komma", "comma", "punkt"} diff --git a/lingua_franca/lang/format_de.py b/lingua_franca/lang/format_de.py index 6d70781e..35c4089b 100644 --- a/lingua_franca/lang/format_de.py +++ b/lingua_franca/lang/format_de.py @@ -14,11 +14,17 @@ # limitations under the License. # -from lingua_franca.lang.format_common import convert_to_mixed_fraction -from lingua_franca.lang.common_data_de import _EXTRA_SPACE_DE, \ - _FRACTION_STRING_DE, _MONTHS_DE, _NUM_POWERS_OF_TEN_DE, _NUM_STRING_DE from math import floor +from lingua_franca.lang.format_common import convert_to_mixed_fraction +from lingua_franca.lang.common_data_de import ( + _EXTRA_SPACE, + _FRACTION_STRING, + _MONTHS, + _NUM_POWERS_OF_TEN, + _NUM_STRING +) + def nice_number_de(number, speech=True, denominators=range(1, 21)): """ German helper for nice_number @@ -44,7 +50,7 @@ def nice_number_de(number, speech=True, denominators=range(1, 21)): return '{} {}/{}'.format(whole, num, den) if num == 0: return str(whole) - den_str = _FRACTION_STRING_DE[den] + den_str = _FRACTION_STRING[den] if whole == 0: if num == 1: return_string = 'ein {}'.format(den_str) @@ -85,24 +91,27 @@ def pronounce_triplet_de(num): if num > 99: hundreds = floor(num / 100) if hundreds > 0: - result += _NUM_STRING_DE[ - hundreds] + _EXTRA_SPACE_DE + 'hundert' + _EXTRA_SPACE_DE + number = _NUM_STRING[hundreds] if hundreds > 1 else "ein" + result += number + 'hundert' + _EXTRA_SPACE num -= hundreds * 100 if num == 0: result += '' # do nothing elif num == 1: result += 'eins' # need the s for the last digit elif num <= 20: - result += _NUM_STRING_DE[num] # + _EXTRA_SPACE_DA + result += _NUM_STRING[num] # + _EXTRA_SPACE_DA elif num > 20: ones = num % 10 tens = num - ones if ones > 0: - result += _NUM_STRING_DE[ones] + _EXTRA_SPACE_DE + number = _NUM_STRING[ones] + if ones == 1 and tens > 0: # eins > ein + number = number[:-1] + result += number + _EXTRA_SPACE if tens > 0: - result += 'und' + _EXTRA_SPACE_DE + result += 'und' + _EXTRA_SPACE if tens > 0: - result += _NUM_STRING_DE[tens] + _EXTRA_SPACE_DE + result += _NUM_STRING[tens] + _EXTRA_SPACE return result def pronounce_fractional_de(num, @@ -112,7 +121,7 @@ def pronounce_fractional_de(num, place = 10 while places > 0: # doesn't work with 1.0001 and places = 2: int( # number*place) % 10 > 0 and places > 0: - result += " " + _NUM_STRING_DE[int(num * place) % 10] + result += " " + _NUM_STRING[int(num * place) % 10] if int(num * place) % 10 == 1: result += 's' # "1" is pronounced "eins" after the decimal # point @@ -135,18 +144,18 @@ def pronounce_whole_number_de(num, scale_level=0): else: result += "eins" elif scale_level == 1: - result += 'ein' + _EXTRA_SPACE_DE + 'tausend' + _EXTRA_SPACE_DE + result += 'ein' + _EXTRA_SPACE + 'tausend' + _EXTRA_SPACE else: - result += "eine " + _NUM_POWERS_OF_TEN_DE[scale_level] + ' ' + result += "eine " + _NUM_POWERS_OF_TEN[scale_level] + ' ' elif last_triplet > 1: result += pronounce_triplet_de(last_triplet) if scale_level == 1: # result += _EXTRA_SPACE_DA - result += 'tausend' + _EXTRA_SPACE_DE + result += 'tausend' + _EXTRA_SPACE if scale_level >= 2: # if _EXTRA_SPACE_DA == '': # result += " " - result += " " + _NUM_POWERS_OF_TEN_DE[scale_level] + result += " " + _NUM_POWERS_OF_TEN[scale_level] if scale_level >= 2: if scale_level % 2 == 0: result += "e" # MillionE @@ -161,7 +170,7 @@ def pronounce_whole_number_de(num, scale_level=0): if abs(number) >= 1000000000000000000000000: # cannot do more than this return str(number) elif number == 0: - return str(_NUM_STRING_DE[0]) + return str(_NUM_STRING[0]) elif number < 0: return "minus " + pronounce_number_de(abs(number), places) else: @@ -278,7 +287,7 @@ def nice_response_de(text): words = text.split() for idx, word in enumerate(words): - if word.lower() in _MONTHS_DE: + if word.lower() in _MONTHS: text = _nice_ordinal_de(text) if word == '^': @@ -300,7 +309,7 @@ def _nice_ordinal_de(text, speech=True): wordPrev = words[idx - 1] if idx > 0 else "" if word[-1:] == ".": if word[:-1].isdecimal(): - if wordNext.lower() in _MONTHS_DE: + if wordNext.lower() in _MONTHS: word = pronounce_ordinal_de(int(word[:-1])) if wordPrev.lower() in ["am", "dem", "vom", "zum", "(vom", "(am", "zum"]: diff --git a/lingua_franca/lang/parse_de.py b/lingua_franca/lang/parse_de.py index 95fda48e..0e479c83 100644 --- a/lingua_franca/lang/parse_de.py +++ b/lingua_franca/lang/parse_de.py @@ -13,77 +13,288 @@ # See the License for the specific language governing permissions and # limitations under the License. # + import re +import json from datetime import datetime, timedelta from dateutil.relativedelta import relativedelta -from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \ - extract_numbers_generic, Normalizer -from lingua_franca.lang.common_data_de import _DE_NUMBERS -from lingua_franca.lang.format_de import pronounce_number_de -from lingua_franca.time import now_local - - -de_numbers = { - 'null': 0, - 'ein': 1, - 'eins': 1, - 'eine': 1, - 'einer': 1, - 'einem': 1, - 'einen': 1, - 'eines': 1, - 'zwei': 2, - 'drei': 3, - 'vier': 4, - 'fünf': 5, - 'sechs': 6, - 'sieben': 7, - 'acht': 8, - 'neun': 9, - 'zehn': 10, - 'elf': 11, - 'zwölf': 12, - 'dreizehn': 13, - 'vierzehn': 14, - 'fünfzehn': 15, - 'sechzehn': 16, - 'siebzehn': 17, - 'achtzehn': 18, - 'neunzehn': 19, - 'zwanzig': 20, - 'einundzwanzig': 21, - 'zweiundzwanzig': 22, - 'dreiundzwanzig': 23, - 'vierundzwanzig': 24, - 'fünfundzwanzig': 25, - 'sechsundzwanzig': 26, - 'siebenundzwanzig': 27, - 'achtundzwanzig': 28, - 'neunundzwanzig': 29, - 'dreißig': 30, - 'einunddreißig': 31, - 'vierzig': 40, - 'fünfzig': 50, - 'sechzig': 60, - 'siebzig': 70, - 'achtzig': 80, - 'neunzig': 90, - 'hundert': 100, - 'zweihundert': 200, - 'dreihundert': 300, - 'vierhundert': 400, - 'fünfhundert': 500, - 'sechshundert': 600, - 'siebenhundert': 700, - 'achthundert': 800, - 'neunhundert': 900, - 'tausend': 1000, - 'million': 1000000 -} - -# TODO: short_scale and ordinals don't do anything here. -# The parameters are present in the function signature for API compatibility -# reasons. + +from lingua_franca.lang.parse_common import ( + ReplaceableNumber, + Normalizer, + Token, + look_for_fractions, + tokenize, +) +from lingua_franca.lang.common_data_de import ( + _STRING_NUM, + _STRING_FRACTION, + _STRING_LONG_ORDINAL, + _STRING_LONG_SCALE, + _MULTIPLIER, + _NEGATIVES, + _NUMBER_CONNECTORS, + _COMMA, + _ARTICLES +) +from lingua_franca.time import now_local, DAYS_IN_1_YEAR, DAYS_IN_1_MONTH +from lingua_franca.internal import resolve_resource_file + + + +def _convert_words_to_numbers_de(text, short_scale=False, + ordinals=False, fractions=True): + """ + Convert words in a string into their equivalent numbers. + Args: + text str: + short_scale boolean: True if short scale numberres should be used. + ordinals boolean: True if ordinals (e.g. first, second, third) should + be parsed to their number values (1, 2, 3...) + Returns: + str + The original text, with numbers subbed in where appropriate. + """ + tokens = tokenize(text) + numbers_to_replace = \ + _extract_numbers_with_text_de(tokens, short_scale, ordinals, fractions) + numbers_to_replace.sort(key=lambda number: number.start_index) + + results = [] + for token in tokens: + if not numbers_to_replace or \ + token.index < numbers_to_replace[0].start_index: + results.append(token.word) + else: + if numbers_to_replace and \ + token.index == numbers_to_replace[0].start_index: + results.append(str(numbers_to_replace[0].value)) + if numbers_to_replace and \ + token.index == numbers_to_replace[0].end_index: + numbers_to_replace.pop(0) + + return ' '.join(results) + + +def _extract_numbers_with_text_de(tokens, short_scale=True, + ordinals=False, fractions=True): + """ + Extract all numbers from a list of Tokens, with the words that + represent them. + + Args: + [Token]: The tokens to parse. + short_scale bool: True if short scale numbers should be used, False for + long scale. True by default. + ordinals bool: True if ordinal words (first, second, third, etc) should + be parsed. + fractional_numbers bool: True if we should look for fractions and + decimals. + + Returns: + [ReplaceableNumber]: A list of tuples, each containing a number and a + string. + + """ + placeholder = "" # inserted to maintain correct indices + results = [] + while True: + to_replace = \ + _extract_number_with_text_de(tokens, short_scale, + ordinals) + + if not to_replace: + break + + if isinstance(to_replace.value, float) and not fractions: + pass + else: + results.append(to_replace) + + tokens = [ + t if not + to_replace.start_index <= t.index <= to_replace.end_index + else + Token(placeholder, t.index) for t in tokens + ] + results.sort(key=lambda n: n.start_index) + return results + + +def _extract_number_with_text_de(tokens, short_scale=True, + ordinals=False): + """ + This function extracts a number from a list of Tokens. + + Args: + tokens str: the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers + fractional_numbers (bool): True if we should look for fractions and + decimals. + Returns: + ReplaceableNumber + + """ + number, tokens = \ + _extract_number_with_text_de_helper(tokens, short_scale, + ordinals) + return ReplaceableNumber(number, tokens) + + +def _extract_number_with_text_de_helper(tokens, + short_scale, ordinals): + """ + Helper for _extract_number_with_text_de. + + Args: + tokens [Token]: + short_scale boolean: + ordinals boolean: + fractional_numbers boolean: + Returns: + int or float, [Tokens] + """ + if ordinals: + for token in tokens: + ordinal = is_ordinal_de(token.word) + if ordinal: + return ordinal, [token] + + return _extract_real_number_with_text_de(tokens, short_scale) + + +def _extract_real_number_with_text_de(tokens, short_scale): + """ + This is handling real numbers. + + Args: + tokens [Token]: + short_scale boolean: + Returns: + int or float, [Tokens] + The value parsed, and tokens that it corresponds to. + """ + number_words = [] + val = _val = _current_val = None + _comma = False + to_sum = [] + + for idx, token in enumerate(tokens): + + _prev_val = _current_val + _current_val = None + + word = token.word + + if word in _NUMBER_CONNECTORS and not number_words: + continue + if word in (_NEGATIVES | _NUMBER_CONNECTORS | _COMMA): + number_words.append(token) + if word in _COMMA: + _comma = token + _current_val = _val or _prev_val + continue + + prev_word = tokens[idx - 1].word if idx > 0 else "" + next_word = tokens[idx + 1].word if idx + 1 < len(tokens) else "" + + if word not in _STRING_LONG_SCALE and \ + word not in _STRING_NUM and \ + word not in _MULTIPLIER and \ + not is_numeric_de(word) and \ + not is_fractional_de(word): + words_only = [token.word for token in number_words] + if _val is not None: + to_sum.append(_val) + if to_sum: + val = sum(to_sum) + + if number_words and (not all([w in _ARTICLES | _NEGATIVES + | _NUMBER_CONNECTORS for w in words_only]) + or str(val) == number_words[-1].word): + break + else: + number_words.clear() + to_sum.clear() + val = _val = _prev_val = None + continue + elif word not in _MULTIPLIER \ + and prev_word not in _MULTIPLIER \ + and prev_word not in _NUMBER_CONNECTORS \ + and prev_word not in _NEGATIVES \ + and prev_word not in _COMMA \ + and prev_word not in _STRING_LONG_SCALE \ + and prev_word not in _STRING_NUM \ + and not is_ordinal_de(word) \ + and not is_numeric_de(prev_word) \ + and not is_fractional_de(prev_word): + number_words = [token] + else: + number_words.append(token) + + # is this word already a number or a word of a number? + _val = _current_val = is_number_de(word) + + # is this a negative number? + if _current_val is not None and prev_word in _NEGATIVES: + _val = 0 - _current_val + + # is the prev word a number and should we multiply it? + if _prev_val is not None and ( word in _MULTIPLIER or \ + word in ("einer", "eines", "einem")): + to_sum.append(_prev_val * _current_val or _current_val) + _val = _current_val = None + + # fraction handling + _fraction_val = is_fractional_de(word, short_scale=short_scale) + if _fraction_val: + if _prev_val is not None and prev_word != "eine" and \ + word not in _STRING_FRACTION: # zusammengesetzter Bruch + _val = _prev_val + _fraction_val + if prev_word not in _NUMBER_CONNECTORS and tokens[idx -1] not in number_words: + number_words.append(tokens[idx - 1]) + elif _prev_val is not None: + _val = _prev_val * _fraction_val + if tokens[idx -1] not in number_words: + number_words.append(tokens[idx - 1]) + else: + _val = _fraction_val + _current_val = _val + + # directly following numbers without relation + if (is_numeric_de(prev_word) or prev_word in _STRING_NUM) \ + and not _fraction_val and not is_fractional_de(next_word) and not to_sum: + val = _prev_val + number_words.pop(-1) + break + + # is this a spoken time ("drei viertel acht") + if isinstance(_prev_val, float) and is_number_de(word) and not to_sum: + if idx+1 < len(tokens): + _, number = _extract_real_number_with_text_de([tokens[idx + 1]], + short_scale=short_scale) + if not next_word or not number: + val = f"{_val-1}:{int(60*_prev_val)}" + break + + # spoken decimals + if _current_val is not None and _comma: + # to_sum = [ 1, 0.2, 0.04,...] + to_sum.append(_current_val if _current_val >= 10 else ( + _current_val) / (10 ** (token.index - _comma.index))) + _val = _current_val = None + + + if _current_val is not None and next_word in (_NUMBER_CONNECTORS | _COMMA | {""}): + to_sum.append(_val or _current_val) + _val = _current_val = None + + + if not next_word and number_words: + val = sum(to_sum) or _val + + return val, number_words def extract_duration_de(text): @@ -122,10 +333,9 @@ def extract_duration_de(text): } # Einzahl und Mehrzahl - pattern = r"(?P\d+(?:\.?\d+)?)(?:\s+|\-){unit}[ne]?" + pattern = r"(?:^|\s)(?P\d+(?:[.,]?\d+)?\b)(?:\s+|\-)(?P{unit}[nes]?[sn]?\b)" - # TODO Einstiegspunkt für Text-zu-Zahlen Konversion - #text = _convert_words_to_numbers_de(text) + text = _convert_words_to_numbers_de(text) for (unit_en, unit_de) in time_units.items(): unit_pattern = pattern.format( @@ -133,7 +343,8 @@ def extract_duration_de(text): time_units[unit_en] = 0 def repl(match): - time_units[unit_en] += float(match.group(1)) + value = match.group("value").replace(",",".") + time_units[unit_en] += float(value) return '' text = re.sub(unit_pattern, repl, text) @@ -143,91 +354,6 @@ def repl(match): return (duration, text) -def extract_number_de(text, short_scale=True, ordinals=False): - """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. - Args: - text (str): the string to normalize - Returns: - (int) or (float): The value of extracted number - - - undefined articles cannot be suppressed in German: - 'ein Pferd' means 'one horse' and 'a horse' - - """ - # TODO: short_scale and ordinals don't do anything here. - # The parameters are present in the function signature for API compatibility - # reasons. - text = text.lower() - aWords = text.split() - aWords = [word for word in aWords if - word not in ["der", "die", "das", "des", "den", "dem"]] - and_pass = False - valPreAnd = False - val = False - count = 0 - while count < len(aWords): - word = aWords[count] - if is_numeric(word): - # if word.isdigit(): # doesn't work with decimals - val = float(word) - elif is_fractional_de(word): - val = is_fractional_de(word) - elif is_ordinal_de(word): - val = is_ordinal_de(word) - else: - if word in _DE_NUMBERS: - val = _DE_NUMBERS[word] - if count < (len(aWords) - 1): - wordNext = aWords[count + 1] - else: - wordNext = "" - valNext = is_fractional_de(wordNext) - - if valNext: - val = val * valNext - aWords[count + 1] = "" - - if not val: - # look for fractions like "2/3" - aPieces = word.split('/') - # if (len(aPieces) == 2 and is_numeric(aPieces[0]) - # and is_numeric(aPieces[1])): - if look_for_fractions(aPieces): - val = float(aPieces[0]) / float(aPieces[1]) - elif and_pass: - # added to value, quit here - val = valPreAnd - break - else: - count += 1 - continue - - aWords[count] = "" - - if and_pass: - aWords[count - 1] = '' # remove "and" - val += valPreAnd - elif count + 1 < len(aWords) and aWords[count + 1] == 'und': - and_pass = True - valPreAnd = val - val = False - count += 2 - continue - elif count + 2 < len(aWords) and aWords[count + 2] == 'und': - and_pass = True - valPreAnd = val - val = False - count += 3 - continue - - break - - return val or False - - def extract_datetime_de(text, anchorDate=None, default_time=None): def clean_string(s): """ @@ -238,17 +364,16 @@ def clean_string(s): for 12 hour date format """ - s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ - .replace(' der ', ' ').replace(' den ', ' ').replace(' an ', - ' ').replace( - ' am ', ' ') \ - .replace(' auf ', ' ').replace(' um ', ' ') + s = _convert_words_to_numbers_de(s) + s = s.lower().replace('?', '').replace(' der ', ' ').replace(' den ', ' ')\ + .replace(' an ', ' ').replace(' am ', ' ').replace(' auf ', ' ')\ + .replace(' um ', ' ') wordList = s.split() for idx, word in enumerate(wordList): - if is_ordinal_de(word) is not False: - word = str(is_ordinal_de(word)) - wordList[idx] = word + ordinal = _get_ordinal_index(word) + if ordinal: + wordList[idx] = ordinal return wordList @@ -280,8 +405,10 @@ def date_found(): timeQualifier = "" timeQualifiersList = ['früh', 'morgens', 'vormittag', 'vormittags', - 'nachmittag', 'nachmittags', 'abend', 'abends', - 'nachts'] + 'mittag', 'mittags', 'nachmittag', 'nachmittags', + 'abend', 'abends', 'nacht', 'nachts', 'pm', 'p.m.'] + eveningQualifiers = ['nachmittag', 'nachmittags', 'abend', 'abends', 'nacht', + 'nachts', 'pm', 'p.m.'] markers = ['in', 'am', 'gegen', 'bis', 'für'] days = ['montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag', 'samstag', 'sonntag'] @@ -316,15 +443,6 @@ def date_found(): wordNext = words[idx + 1] if idx + 1 < len(words) else "" wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" - # this isn't in clean string because I don't want to save back to words - - if word != 'morgen' and word != 'übermorgen': - if word[-2:] == "en": - word = word[:-2] # remove en - if word != 'heute': - if word[-1:] == "e": - word = word[:-1] # remove plural for most nouns - start = idx used = 0 # save timequalifier for later @@ -343,14 +461,16 @@ def date_found(): dayOffset = 2 used += 1 # parse 5 days, 10 weeks, last week, next week - elif word == "tag" or word == "tage": - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) + elif word[:3] == "tag" and len(word) <= 5: + num = is_number_de(wordPrev) + if num: + dayOffset += num start -= 1 used = 2 - elif word == "woch" and not fromFlag: - if wordPrev[0].isdigit(): - dayOffset += int(wordPrev) * 7 + elif word[:5] == "woche" and len(word) <= 7 and not fromFlag: + num = is_number_de(wordPrev) + if num: + dayOffset += num * 7 start -= 1 used = 2 elif wordPrev[:6] == "nächst": @@ -362,9 +482,10 @@ def date_found(): start -= 1 used = 2 # parse 10 months, next month, last month - elif word == "monat" and not fromFlag: - if wordPrev[0].isdigit(): - monthOffset = int(wordPrev) + elif word[:5] == "monat" and len(word) <= 7 and not fromFlag: + num = is_number_de(wordPrev) + if num: + monthOffset = num start -= 1 used = 2 elif wordPrev[:6] == "nächst": @@ -376,9 +497,10 @@ def date_found(): start -= 1 used = 2 # parse 5 years, next year, last year - elif word == "jahr" and not fromFlag: - if wordPrev[0].isdigit(): - yearOffset = int(wordPrev) + elif word[:4] == "jahr" and len(word) <= 6 and not fromFlag: + num = is_number_de(wordPrev) + if num: + yearOffset = num start -= 1 used = 2 elif wordPrev[:6] == "nächst": @@ -528,16 +650,14 @@ def date_found(): hrAbs = 19 used += 1 # parse half an hour, quarter hour - elif word == "stunde" and \ + elif word[:5] == "nacht": + if not hrAbs: + hrAbs = 23 + used += 1 + elif word[:6] == "stunde" and \ (wordPrev in markers or wordPrevPrev in markers): - if wordPrev[:4] == "halb": - minOffset = 30 - elif wordPrev == "viertel": - minOffset = 15 - elif wordPrev == "dreiviertel": - minOffset = 45 - else: - hrOffset = 1 + factor = is_number_de(word) or 1 + minOffset = 60 * factor if wordPrevPrev in markers: words[idx - 2] = "" words[idx - 1] = "" @@ -549,6 +669,7 @@ def date_found(): isTime = True strHH = "" strMM = "" + timeQualifier = "" remainder = "" if ':' in word: # parse colons @@ -575,51 +696,12 @@ def date_found(): break if remainder == "": nextWord = wordNext.replace(".", "") - if nextWord == "am" or nextWord == "pm": - remainder = nextWord - used += 1 - elif nextWord == "abends": - remainder = "pm" - used += 1 - elif wordNext == "am" and wordNextNext == "morgen": - remainder = "am" - used += 2 - elif wordNext == "am" and wordNextNext == "nachmittag": - remainder = "pm" - used += 2 - elif wordNext == "am" and wordNextNext == "abend": - remainder = "pm" - used += 2 - elif wordNext == "morgens": - remainder = "am" + if nextWord in eveningQualifiers: used += 1 - elif wordNext == "nachmittags": - remainder = "pm" + timeQualifier = "pm" + elif nextWord in timeQualifiersList: used += 1 - elif wordNext == "abends": - remainder = "pm" - used += 1 - elif wordNext == "heute" and wordNextNext == "morgen": - remainder = "am" - used = 2 - elif wordNext == "heute" and wordNextNext == "nachmittag": - remainder = "pm" - used = 2 - elif wordNext == "heute" and wordNextNext == "abend": - remainder = "pm" - used = 2 - elif wordNext == "nachts": - if strHH > 4: - remainder = "pm" - else: - remainder = "am" - used += 1 - else: - if timeQualifier != "": - if strHH <= 12 and \ - (timeQualifier == "abends" or - timeQualifier == "nachmittags"): - strHH += 12 # what happens when strHH is 24? + timeQualifier = "am" else: # try to parse # s without colons # 5 hours, 10 minutes etc. @@ -633,7 +715,7 @@ def date_found(): remainder += word[i] if remainder == "": - remainder = wordNext.replace(".", "").lstrip().rstrip() + timeQualifier = wordNext.replace(".", "").lstrip().rstrip() if ( remainder == "pm" or @@ -641,7 +723,7 @@ def date_found(): remainder == "p.m." or wordNext == "p.m."): strHH = strNum - remainder = "pm" + timeQualifier = "pm" used = 1 elif ( remainder == "am" or @@ -649,26 +731,26 @@ def date_found(): remainder == "a.m." or wordNext == "a.m."): strHH = strNum - remainder = "am" + timeQualifier = "am" used = 1 else: - if wordNext == "stund" and int(word) < 100: + if wordNext[:6] == "stunde" and len(wordNext) <= 7: # "in 3 hours" - hrOffset = int(word) + hrOffset = is_number_de(word) or 1 used = 2 isTime = False hrAbs = -1 minAbs = -1 - elif wordNext == "minut": + elif wordNext[:6] == "minute" and len(wordNext) <= 7: # "in 10 minutes" - minOffset = int(word) + minOffset = is_number_de(word) or 1 used = 2 isTime = False hrAbs = -1 minAbs = -1 - elif wordNext == "sekund": + elif wordNext[:7] == "sekunde" and len(wordNext) <= 8: # in 5 seconds - secOffset = int(word) + secOffset = is_number_de(word) or 1 used = 2 isTime = False hrAbs = -1 @@ -678,96 +760,96 @@ def date_found(): strHH = word used += 1 isTime = True - if wordNextNext == timeQualifier: + if wordNextNext in timeQualifiersList: strMM = "" if wordNextNext[:10] == "nachmittag": used += 1 - remainder = "pm" + timeQualifier = "pm" elif wordNextNext == "am" and wordNextNextNext == \ "nachmittag": used += 2 - remainder = "pm" + timeQualifier = "pm" elif wordNextNext[:5] == "abend": used += 1 - remainder = "pm" + timeQualifier = "pm" elif wordNextNext == "am" and wordNextNextNext == \ "abend": used += 2 - remainder = "pm" + timeQualifier = "pm" elif wordNextNext[:7] == "morgens": used += 1 - remainder = "am" + timeQualifier = "am" elif wordNextNext == "am" and wordNextNextNext == \ "morgen": used += 2 - remainder = "am" - elif wordNextNext == "nachts": + timeQualifier = "am" + elif wordNextNext[:5] == "nacht": used += 1 if 8 <= int(word) <= 12: - remainder = "pm" + timeQualifier = "pm" else: - remainder = "am" + timeQualifier = "am" - elif is_numeric(wordNextNext): + elif is_numeric_de(wordNextNext): strMM = wordNextNext used += 1 if wordNextNextNext == timeQualifier: if wordNextNextNext[:10] == "nachmittag": used += 1 - remainder = "pm" + timeQualifier = "pm" elif wordNextNextNext == "am" and \ wordNextNextNextNext == "nachmittag": used += 2 - remainder = "pm" + timeQualifier = "pm" elif wordNextNextNext[:5] == "abend": used += 1 - remainder = "pm" + timeQualifier = "pm" elif wordNextNextNext == "am" and \ wordNextNextNextNext == "abend": used += 2 - remainder = "pm" + timeQualifier = "pm" elif wordNextNextNext[:7] == "morgens": used += 1 - remainder = "am" + timeQualifier = "am" elif wordNextNextNext == "am" and \ wordNextNextNextNext == "morgen": used += 2 - remainder = "am" + timeQualifier = "am" elif wordNextNextNext == "nachts": used += 1 if 8 <= int(word) <= 12: - remainder = "pm" + timeQualifier = "pm" else: - remainder = "am" + timeQualifier = "am" - elif wordNext == timeQualifier: + elif wordNext in timeQualifiersList: strHH = word strMM = 00 isTime = True if wordNext[:10] == "nachmittag": used += 1 - remainder = "pm" + timeQualifier = "pm" elif wordNext == "am" and wordNextNext == "nachmittag": used += 2 - remainder = "pm" + timeQualifier = "pm" elif wordNext[:5] == "abend": used += 1 - remainder = "pm" + timeQualifier = "pm" elif wordNext == "am" and wordNextNext == "abend": used += 2 - remainder = "pm" + timeQualifier = "pm" elif wordNext[:7] == "morgens": used += 1 - remainder = "am" + timeQualifier = "am" elif wordNext == "am" and wordNextNext == "morgen": used += 2 - remainder = "am" + timeQualifier = "am" elif wordNext == "nachts": used += 1 if 8 <= int(word) <= 12: - remainder = "pm" + timeQualifier = "pm" else: - remainder = "am" + timeQualifier = "am" # if timeQualifier != "": # military = True @@ -776,8 +858,14 @@ def date_found(): strHH = int(strHH) if strHH else 0 strMM = int(strMM) if strMM else 0 - strHH = strHH + 12 if remainder == "pm" and strHH < 12 else strHH - strHH = strHH - 12 if remainder == "am" and strHH >= 12 else strHH + if timeQualifier != "": + if strHH <= 12 and timeQualifier == "pm" and not \ + (strHH == 12 and any([q in words for q in ("pm", "p.m.")])): + if strHH == 12: + strHH = 0 + dayOffset +=1 + else: + strHH += 12 if strHH > 24 or strMM > 59: isTime = False used = 0 @@ -900,107 +988,112 @@ def date_found(): return [extractedDate, resultStr] -def is_fractional_de(input_str, short_scale=True): +def is_fractional_de(input_str, short_scale=False): """ This function takes the given text and checks if it is a fraction. - Args: input_str (str): the string to check if fractional short_scale (bool): use short scale if True, long scale if False Returns: (bool) or (float): False if not a fraction, otherwise the fraction - """ - if input_str.lower().startswith("halb"): - return 0.5 - - if input_str.lower() == "drittel": - return 1.0 / 3 - elif input_str.endswith('tel'): - if input_str.endswith('stel'): - input_str = input_str[:len(input_str) - 4] # e.g. "hundertstel" - else: - input_str = input_str[:len(input_str) - 3] # e.g. "fünftel" - if input_str.lower() in _DE_NUMBERS: - return 1.0 / (_DE_NUMBERS[input_str.lower()]) + # account for different numerators, e.g. zweidrittel + + input_str = input_str.lower() + numerator = 1 + prev_number = 0 + denominator = False + remainder = "" + + # first check if is a fraction containing a char (eg "2/3") + _bucket = input_str.split('/') + if look_for_fractions(_bucket): + numerator = float(_bucket[0]) + denominator = float(_bucket[1]) + + if not denominator: + for fraction in sorted(_STRING_FRACTION.keys(), + key=lambda x: len(x), + reverse=True): + if fraction in input_str and not denominator: + denominator = _STRING_FRACTION.get(fraction) + remainder = input_str.replace(fraction, "") + break + + if remainder: + if not _STRING_NUM.get(remainder, False): + #acount for eineindrittel + for numstring, number in _STRING_NUM.items(): + if remainder.endswith(numstring): + prev_number = _STRING_NUM.get( + remainder.replace(numstring, "", 1), 0) + numerator = number + break + else: + return False + else: + numerator = _STRING_NUM.get(remainder) - return False + if denominator: + return prev_number + (numerator / denominator) + else: + return False def is_ordinal_de(input_str): """ This function takes the given text and checks if it is an ordinal number. - Args: input_str (str): the string to check if ordinal Returns: (bool) or (float): False if not an ordinal, otherwise the number corresponding to the ordinal - ordinals for 1, 3, 7 and 8 are irregular - - only works for ordinals corresponding to the numbers in _DE_NUMBERS - + only works for ordinals corresponding to the numbers in _STRING_NUM """ + val = _STRING_LONG_ORDINAL.get(input_str.lower(), False) + # account for numbered ordinals + if not val and input_str.endswith('.') and is_numeric_de(input_str[:-1]): + val = input_str + return val - lowerstr = input_str.lower() - - if lowerstr.startswith("erste"): - return 1 - if lowerstr.startswith("dritte"): - return 3 - if lowerstr.startswith("siebte"): - return 7 - if lowerstr.startswith("achte"): - return 8 - - if lowerstr[-3:] == "ste": # from 20 suffix is -ste* - lowerstr = lowerstr[:-3] - if lowerstr in _DE_NUMBERS: - return _DE_NUMBERS[lowerstr] - - if lowerstr[-4:] in ["ster", "stes", "sten", "stem"]: - lowerstr = lowerstr[:-4] - if lowerstr in _DE_NUMBERS: - return _DE_NUMBERS[lowerstr] - - if lowerstr[-2:] == "te": # below 20 suffix is -te* - lowerstr = lowerstr[:-2] - if lowerstr in _DE_NUMBERS: - return _DE_NUMBERS[lowerstr] - - if lowerstr[-3:] in ["ter", "tes", "ten", "tem"]: - lowerstr = lowerstr[:-3] - if lowerstr in _DE_NUMBERS: - return _DE_NUMBERS[lowerstr] - - return False +def _get_ordinal_index(input_str : str, type_: type = str): + ord = is_ordinal_de(input_str) + return type_(ord.replace(".","")) if ord else ord -def normalize_de(text, remove_articles=True): - """ German string normalization """ - # TODO return GermanNormalizer().normalize(text, remove_articles) - words = text.split() # this also removed extra spaces - normalized = "" - for word in words: - if remove_articles and word in ["der", "die", "das", "des", "den", - "dem"]: - continue - - # Expand common contractions, e.g. "isn't" -> "is not" - contraction = ["net", "nett"] - if word in contraction: - expansion = ["nicht", "nicht"] - word = expansion[contraction.index(word)] - # Convert numbers into digits, e.g. "two" -> "2" +def is_number_de(word: str): + if is_numeric_de(word): + if word.isdigit(): + return int(word) + else: + return float(word) + elif word in _STRING_NUM: + return _STRING_NUM.get(word) + elif word in _STRING_LONG_SCALE: + return _STRING_LONG_SCALE.get(word) + + return None - if word in _DE_NUMBERS: - word = str(_DE_NUMBERS[word]) +def is_numeric_de(input_str): + """ + Takes in a string and tests to see if it is a number. - normalized += " " + word + Args: + text (str): string to test if a number + Returns: + (bool): True if a number, else False + """ + # da float("1.") = 1.0 + if input_str.endswith('.'): + return False + try: + float(input_str) + return True + except ValueError: + return False - return normalized[1:] # strip the initial space def extract_numbers_de(text, short_scale=True, ordinals=False): @@ -1017,9 +1110,52 @@ def extract_numbers_de(text, short_scale=True, ordinals=False): Returns: list: list of extracted numbers as floats """ - return extract_numbers_generic(text, pronounce_number_de, extract_number_de, - short_scale=short_scale, ordinals=ordinals) + results = _extract_numbers_with_text_de(tokenize(text), + short_scale, ordinals) + # note: ordinal values return in form of "1." (castable into float) + values = [float(result.value) for result in results] + for i, val in enumerate(values): + if val.is_integer(): + values[i] = int(val) + + return values + + +def extract_number_de(text, short_scale=True, ordinals=False): + """ + This function extracts a number from a text string + + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers + Returns: + (int) or (float) or False: The extracted number or False if no number + was found + + """ + numbers = _extract_numbers_with_text_de(tokenize(text.lower()), + short_scale, ordinals) + # if query ordinals only consider ordinals + if ordinals: + numbers = list(filter(lambda x: isinstance(x.value, str) + and x.value.endswith("."), + numbers)) + + number = numbers[0].value if numbers else None + + if number: + number = float(number) + if number.is_integer(): + number = int(number) + + return number class GermanNormalizer(Normalizer): - """ TODO implement language specific normalizer""" + with open(resolve_resource_file("text/de-de/normalize.json")) as f: + _default_config = json.load(f) + + +def normalize_de(text, remove_articles=True): + return GermanNormalizer().normalize(text, remove_articles) diff --git a/lingua_franca/res/text/de-de/normalize.json b/lingua_franca/res/text/de-de/normalize.json new file mode 100644 index 00000000..3ede9681 --- /dev/null +++ b/lingua_franca/res/text/de-de/normalize.json @@ -0,0 +1,79 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": false, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": { + "am": "an dem", + "ans": "an das", + "aufs": "auf das", + "beim": "bei dem", + "durchs": "durch das", + "fürs": "für das", + "hinterm": "hinter dem", + "ins": "in das", + "übers": "über das", + "unters": "unter das", + "unterm": "unter dem", + "vom": "von dem", + "vors": "vor das", + "vorm": "vor dem", + "zum": "zu dem" + }, + "word_replacements": { + "m": "meter", + "km": "kilometer" + }, + "number_replacements": { + "null": "0", + "eins": "1", + "zwei": "2", + "drei": "3", + "vier": "4", + "fünf": "5", + "sechs": "6", + "sieben": "7", + "acht": "8", + "neun": "9", + "zehn": "10", + "elf": "11", + "zwölf": "12", + "dreizehn": "13", + "vierzehn": "14", + "fünfzehn": "15", + "sechzehn": "16", + "siebzehn": "17", + "achtzehn": "18", + "neunzehn": "19", + "zwanzig": "20", + "einundzwanzig": "21", + "zweiundzwanzig": "22", + "dreiundzwanzig": "23", + "vierundzwanzig": "24", + "fünfundzwanzig": "25", + "sechsundzwanzig": "26", + "siebenundzwanzig": "27", + "achtundzwanzig": "28", + "neunundzwanzig": "29", + "dreißig": "30", + "einunddreißig": "31", + "vierzig": "40", + "fünfzig": "50", + "sechtzig": "60", + "siebzig": "70", + "achtzig": "80", + "neunzig": "90" + }, + "stopwords": [], + "articles": [ + "der", + "die", + "das", + "dem", + "den", + "des" + ] + } diff --git a/test/unittests/test_parse_de.py b/test/unittests/test_parse_de.py index 18ed335a..d4d4702a 100644 --- a/test/unittests/test_parse_de.py +++ b/test/unittests/test_parse_de.py @@ -17,10 +17,14 @@ from datetime import datetime, time, timedelta from lingua_franca import load_language, unload_language, set_default_lang -from lingua_franca.parse import extract_datetime -from lingua_franca.parse import extract_duration -from lingua_franca.parse import extract_number -from lingua_franca.parse import normalize +from lingua_franca.parse import ( + extract_datetime, + extract_duration, + extract_number, + extract_numbers, + normalize +) +from lingua_franca.lang.parse_de import _convert_words_to_numbers_de def setUpModule(): @@ -33,55 +37,218 @@ def tearDownModule(): class TestNormalize(unittest.TestCase): + def test_articles(self): self.assertEqual( normalize("dies ist der test", lang="de-de", remove_articles=True), "dies ist test") self.assertEqual( normalize("und noch ein Test", lang="de-de", remove_articles=True), - "und noch 1 Test") + "und noch ein Test") self.assertEqual(normalize("dies ist der Extra-Test", lang="de-de", remove_articles=False), "dies ist der Extra-Test") - def test_extract_number(self): - self.assertEqual(extract_number("dies ist der 1. Test", - lang="de-de"), 1) - self.assertEqual(extract_number("dies ist der erste Test", - lang="de-de"), 1) - self.assertEqual(extract_number("dies ist 2 Test", lang="de-de"), 2) - self.assertEqual(extract_number("dies ist zweiter Test", lang="de-de"), - 2) + def test_spaces(self): + self.assertEqual(normalize(" dies ist ein test", lang="de-de"), + "dies ist ein test") + self.assertEqual(normalize(" dies ist ein test ", + lang="de-de"), "dies ist ein test") + + def test_numbers(self): + self.assertEqual( + normalize("dies ist eins zwei drei test", lang="de-de"), + "dies ist 1 2 3 test") self.assertEqual( - extract_number("dies ist der dritte Test", lang="de-de"), 3) + normalize("es ist vier fünf sechs test", lang="de-de"), + "es ist 4 5 6 test") + self.assertEqual( + normalize("es ist sieben acht neun test", lang="de-de"), + "es ist 7 8 9 test") + self.assertEqual( + normalize("es ist sieben acht neun test", lang="de-de"), + "es ist 7 8 9 test") + self.assertEqual( + normalize("dies ist zehn elf zwölf test", lang="de-de"), + "dies ist 10 11 12 test") self.assertEqual( - extract_number("dies ist der Test Nummer 4", lang="de-de"), 4) + normalize("dies ist dreizehn vierzehn test", lang="de-de"), + "dies ist 13 14 test") + self.assertEqual( + normalize("dies ist fünfzehn sechzehn siebzehn", lang="de-de"), + "dies ist 15 16 17") + self.assertEqual( + normalize("dies ist achtzehn neunzehn zwanzig", lang="de-de"), + "dies ist 18 19 20") + + +class TestExtractNumber(unittest.TestCase): + + def test_extract_number(self): + + self.assertEqual(extract_number("dies ist der 1. von 100 Tests"), 100) + + self.assertEqual(extract_number("dies ist Test 1.5"), 1.5) + + self.assertEqual(extract_number("dies ist 2 Test"), 2) + + self.assertEqual(extract_number("dies ist Test 3 von 4"), 3) + + self.assertEqual(extract_number("dies ist der Test Nummer 4"), 4) + + self.assertEqual(extract_number("drei Tassen"), 3) + + self.assertEqual(extract_number("dreißig tausend vier hundert zwanzig"), + 30420) + + self.assertEqual(extract_number("dreißig tausend vier hundert zwanzig komma zwei"), + 30420.2) + + self.assertEqual(extract_number("billionen jahre später"), + 1000000000000) + + self.assertEqual(extract_number("frisörtermin in einer stunde"), 1) + + self.assertEqual(extract_number("frisörtermin in 1 stunde"), 1) + + self.assertEqual(extract_number("minus dreißig grad"), -30) + + self.assertEqual(extract_number("das macht null sinn"), 0) + + self.assertEqual(extract_number("eins punkt drei vier"), 1.34) + + self.assertEqual(extract_number("eins komma drei vier"), 1.34) + + self.assertEqual(extract_number("null komma eins"), 0.1) + + self.assertEqual(extract_number("null komma nichts"), 0) + + self.assertEqual(extract_number("eins komma null vier"), 1.04) + + # Brüche self.assertEqual(extract_number("ein drittel einer Tasse", lang="de-de"), 1.0 / 3.0) - self.assertEqual(extract_number("drei Tassen", lang="de-de"), 3) + self.assertEqual(extract_number("1/3 Tasse", lang="de-de"), 1.0 / 3.0) + self.assertEqual(extract_number("eine viertel Tasse", lang="de-de"), - 0.25) + 0.25) + self.assertEqual(extract_number("1/4 Tasse", lang="de-de"), 0.25) + self.assertEqual(extract_number("viertel Tasse", lang="de-de"), 0.25) + self.assertEqual(extract_number("2/3 Tasse", lang="de-de"), 2.0 / 3.0) + self.assertEqual(extract_number("3/4 Tasse", lang="de-de"), 3.0 / 4.0) + self.assertEqual(extract_number("1 und 3/4 Tassen", lang="de-de"), - 1.75) - self.assertEqual(extract_number("1 Tasse und eine halbe", + 1.75) + + self.assertEqual(extract_number("eine und eine halbe Tasse", lang="de-de"), 1.5) - self.assertEqual( - extract_number("eine Tasse und eine halbe", lang="de-de"), 1.5) - self.assertEqual( - extract_number("eine und eine halbe Tasse", lang="de-de"), 1.5) + + self.assertEqual(extract_number("ein halber krug", + lang="de-de"), 0.5) + self.assertEqual(extract_number("ein und ein halb Tassen", lang="de-de"), 1.5) + self.assertEqual(extract_number("drei Viertel Tasse", lang="de-de"), - 3.0 / 4.0) + 3.0 / 4.0) + self.assertEqual(extract_number("drei Viertel Tassen", lang="de-de"), - 3.0 / 4.0) + 3.0 / 4.0) + self.assertEqual(extract_number("Drei Viertel Tassen", lang="de-de"), - 3.0 / 4.0) + 3.0 / 4.0) + + self.assertEqual(extract_number("frisörtermin in einer halben stunde"), + 0.5) + + self.assertEqual(extract_number("frisörtermin in einer viertel stunde"), + 0.25) + + self.assertEqual(extract_number("eine drei viertel stunde"), 0.75) + + self.assertEqual(extract_number("eine dreiviertel stunde"), 0.75) + + self.assertEqual(extract_number("eineinviertel stunden"), 1.25) + + self.assertEqual(extract_number("ein einviertel stunden"), 1.25) + + self.assertEqual(extract_number("ein drittel einer Tasse"), 1.0 / 3.0) + + self.assertEqual(extract_number("neun einhalb tage und neun sekunden"), + 9.5) + + # Ordinal + + self.assertEqual(extract_number("dies ist der 1. Test", + ordinals=True), 1) + + self.assertEqual(extract_number("dies ist der erste Test", + ordinals=True), 1) + + self.assertEqual(extract_number("dies ist zweiter Test", + ordinals=True), 2) + + self.assertEqual(extract_number("dies ist der dritte Test", + ordinals=True), 3) + + self.assertEqual(extract_number("das Vierte", + ordinals=True), 4) + + self.assertEqual(extract_number("von den vier das zweite", + ordinals=True), 2) + + +class TestConvertWordsToNumbers(unittest.TestCase): + # this is mostly tested with extract_number(s), but to test `fractions=False` + def test_convert_words_to_numbers(self): + + # convert "zwei" (2), but dont convert "ein halber" (0.5) + self.assertEqual(_convert_words_to_numbers_de("das ist zwei mal ein halber test", fractions=False), + "das ist 2 mal ein halber test") + + # dont convert "eins komma zwei" / normally 1.2 + self.assertEqual(_convert_words_to_numbers_de("eins komma zwei plus 2 ist", fractions=False), + "eins komma zwei plus 2 ist") + + +class TestExtractNumbers(unittest.TestCase): + + def test_extract_numbers(self): + + self.assertEqual(extract_numbers("test eins zwei drei"), + [1, 2, 3]) + + self.assertEqual(extract_numbers("der vierte soll nur 1.5 ausgeben"), + [1.5]) + + self.assertEqual(extract_numbers("die fünf wird zur 5"), + [5, 5]) + + self.assertEqual(extract_numbers("ein eindrittel Tassel für 2 peronen"), + [4/3, 2]) + + self.assertEqual(extract_numbers("zwei biere für zwei bären"), + [2, 2]) + + self.assertEqual(extract_numbers("zwanzig 20 20 zwanzig"), + [20, 20, 20, 20]) + + self.assertEqual(extract_numbers("zwanzig zwei und zwanzig"), + [20, 22]) + + self.assertEqual(extract_numbers("zwei schweine und sechs millionen bakterien"), + [2, 6000000]) + + self.assertEqual(extract_numbers("das dritte der vier", + ordinals=True), [3, 4]) + + +class TestExtractDatetime(unittest.TestCase): def test_extractdatetime_de(self): def extractWithFormat(text): @@ -96,82 +263,145 @@ def testExtract(text, expected_date, expected_leftover): self.assertEqual(res[0], expected_date) self.assertEqual(res[1], expected_leftover) + testExtract("mache den frisörtermin in einer halben stunde", + "2017-06-27 00:30:00", "mache frisörtermin") + + testExtract("mache den frisörtermin in drei stunden", + "2017-06-27 03:00:00", "mache frisörtermin") + + testExtract("setze den frisörtermin auf halb neun abends", + "2017-06-27 20:30:00", "setze frisörtermin") + + testExtract("setze den frisörtermin auf halb neun am abend", + "2017-06-27 20:30:00", "setze frisörtermin") + + testExtract("setze den timer auf zwölf uhr nachts", + "2017-06-28 00:00:00", "setze timer") + + testExtract("setze den frisörtermin auf halb neun", + "2017-06-27 08:30:00", "setze frisörtermin") + + testExtract("setze den frisörtermin in 5 tagen", + "2017-07-02 00:00:00", "setze frisörtermin") + + testExtract("setze den frisörtermin in 5 tagen um halb 10", + "2017-07-02 09:30:00", "setze frisörtermin") + testExtract("setze den frisörtermin auf 5 tage von heute", "2017-07-02 00:00:00", "setze frisörtermin") + + testExtract("wir bekommen das ergebnis innerhalb eines tages", + "2017-06-28 00:00:00", "wir bekommen das ergebnis innerhalb") + testExtract("wie ist das wetter übermorgen?", "2017-06-29 00:00:00", "wie ist das wetter") + testExtract("erinnere mich um 10:45 abends", "2017-06-27 22:45:00", "erinnere mich") + testExtract("was ist das Wetter am freitag morgen", "2017-06-30 08:00:00", "was ist das wetter") + testExtract("wie ist das wetter morgen", "2017-06-28 00:00:00", "wie ist das wetter") + testExtract( "erinnere mich meine mutter anzurufen in 8 Wochen und 2 Tagen", "2017-08-24 00:00:00", "erinnere mich meine mutter anzurufen") + testExtract("spiele rick astley musik 2 tage von freitag", "2017-07-02 00:00:00", "spiele rick astley musik") + testExtract("starte die invasion um 3:45 pm am Donnerstag", "2017-06-29 15:45:00", "starte die invasion") + testExtract("am montag bestelle kuchen von der bäckerei", "2017-07-03 00:00:00", "bestelle kuchen von bäckerei") + testExtract("spiele happy birthday musik 5 jahre von heute", "2022-06-27 00:00:00", "spiele happy birthday musik") + testExtract("skype mama um 12:45 pm nächsten Donnerstag", "2017-07-06 12:45:00", "skype mama") + testExtract("wie ist das wetter nächsten donnerstag?", "2017-07-06 00:00:00", "wie ist das wetter") + testExtract("wie ist das Wetter nächsten Freitag morgen", "2017-07-07 08:00:00", "wie ist das wetter") + testExtract("wie ist das wetter nächsten freitag abend", "2017-07-07 19:00:00", "wie ist das wetter") + testExtract("wie ist das wetter nächsten freitag nachmittag", "2017-07-07 15:00:00", "wie ist das wetter") + testExtract("erinnere mich mama anzurufen am dritten august", "2017-08-03 00:00:00", "erinnere mich mama anzurufen") + testExtract("kaufe feuerwerk am einundzwanzigsten juli", "2017-07-21 00:00:00", "kaufe feuerwerk") + testExtract("wie ist das wetter 2 wochen ab nächsten freitag", "2017-07-21 00:00:00", "wie ist das wetter") + testExtract("wie ist das wetter am mittwoch um 07:00", "2017-06-28 07:00:00", "wie ist das wetter") + testExtract("wie ist das wetter am mittwoch um 7 uhr", "2017-06-28 07:00:00", "wie ist das wetter") + testExtract("Mache einen Termin um 12:45 pm nächsten donnerstag", - "2017-07-06 12:45:00", "mache einen termin") + "2017-07-06 12:45:00", "mache termin") + testExtract("wie ist das wetter an diesem donnerstag?", "2017-06-29 00:00:00", "wie ist das wetter") + testExtract("vereinbare den besuch für 2 wochen und 6 tage ab samstag", "2017-07-21 00:00:00", "vereinbare besuch") + testExtract("beginne die invasion um 03:45 am donnerstag", "2017-06-29 03:45:00", "beginne die invasion") + testExtract("beginne die invasion um 3 uhr nachts am donnerstag", "2017-06-29 03:00:00", "beginne die invasion") + testExtract("beginne die invasion um 8 Uhr am donnerstag", "2017-06-29 08:00:00", "beginne die invasion") + testExtract("starte die party um 8 uhr abends am donnerstag", "2017-06-29 20:00:00", "starte die party") + testExtract("starte die invasion um 8 abends am donnerstag", "2017-06-29 20:00:00", "starte die invasion") + testExtract("starte die invasion am donnerstag um mittag", "2017-06-29 12:00:00", "starte die invasion") + testExtract("starte die invasion am donnerstag um mitternacht", "2017-06-29 00:00:00", "starte die invasion") + testExtract("starte die invasion am donnerstag um 5 uhr", "2017-06-29 05:00:00", "starte die invasion") + testExtract("erinnere mich aufzuwachen in 4 jahren", "2021-06-27 00:00:00", "erinnere mich aufzuwachen") + testExtract("erinnere mich aufzuwachen in 4 jahren und 4 tagen", "2021-07-01 00:00:00", "erinnere mich aufzuwachen") + testExtract("wie ist das wetter 3 Tage nach morgen?", "2017-07-01 00:00:00", "wie ist das wetter") + testExtract("dritter dezember", "2017-12-03 00:00:00", "") + testExtract("lass uns treffen um 8:00 abends", "2017-06-27 20:00:00", "lass uns treffen") def test_extractdatetime_no_time(self): """Check that None is returned if no time is found in sentence.""" + self.assertEqual(extract_datetime('kein zeit', lang='de-de'), None) def test_extractdatetime_default_de(self): @@ -179,79 +409,67 @@ def test_extractdatetime_default_de(self): anchor = datetime(2017, 6, 27, 0, 0) res = extract_datetime("lass uns treffen am freitag", anchor, lang='de-de', default_time=default) + self.assertEqual(default, res[0].time()) +class TestExtractDuration(unittest.TestCase): def test_extract_duration_de(self): + self.assertEqual(extract_duration("10 sekunden", lang="de-de"), - (timedelta(seconds=10.0), "")) + (timedelta(seconds=10.0), "")) + self.assertEqual(extract_duration("5 minuten", lang="de-de"), - (timedelta(minutes=5), "")) + (timedelta(minutes=5), "")) + self.assertEqual(extract_duration("2 stunden", lang="de-de"), - (timedelta(hours=2), "")) + (timedelta(hours=2), "")) + self.assertEqual(extract_duration("3 tage", lang="de-de"), - (timedelta(days=3), "")) + (timedelta(days=3), "")) + self.assertEqual(extract_duration("25 wochen", lang="de-de"), - (timedelta(weeks=25), "")) - # TODO no german text to number parsing yet - #self.assertEqual(extract_duration("sieben stunden"), - # (timedelta(hours=7), "")) - self.assertEqual(extract_duration("7.5 sekunden", lang="de-de"), - (timedelta(seconds=7.5), "")) - #self.assertEqual(extract_duration("eight and a half days thirty" - # " nine seconds"), - # (timedelta(days=8.5, seconds=39), "")) + (timedelta(weeks=25), "")) + + self.assertEqual(extract_duration("sieben stunden"), + (timedelta(hours=7), "")) + + self.assertEqual(extract_duration("7,5 sekunden", lang="de-de"), + (timedelta(seconds=7.5), "")) + + self.assertEqual(extract_duration(("neun einhalb tage und " + "10 minuten")), + (timedelta(days=9.5, minutes=10), "und")) + self.assertEqual(extract_duration("starte timer für 30 minuten", lang="de-de"), - (timedelta(minutes=30), "starte timer für")) - #self.assertEqual(extract_duration("Four and a half minutes until" - # " sunset"), - # (timedelta(minutes=4.5), "until sunset")) - #self.assertEqual(extract_duration("Nineteen minutes past the hour"), - # (timedelta(minutes=19), "past the hour")) - self.assertEqual(extract_duration("weck mich in 3 wochen, " - " 497 tage und" - " 391.6 sekunden", lang="de-de"), - (timedelta(weeks=3, days=497, seconds=391.6), - "weck mich in , und")) - #self.assertEqual(extract_duration("The movie is one hour, fifty seven" - # " and a half minutes long"), - # (timedelta(hours=1, minutes=57.5), - # "the movie is , long")) - self.assertEqual(extract_duration("10-sekunden", lang="de-de"), - (timedelta(seconds=10.0), "")) - self.assertEqual(extract_duration("5-minuten", lang="de-de"), - (timedelta(minutes=5), "")) + (timedelta(minutes=30), "starte timer für")) - def test_spaces(self): - self.assertEqual(normalize(" dies ist ein test", lang="de-de"), - "dies ist 1 test") - self.assertEqual(normalize(" dies ist ein test ", - lang="de-de"), "dies ist 1 test") + self.assertEqual(extract_duration(("viereinhalb minuten bis" + " sonnenuntergang")), + (timedelta(minutes=4.5), "bis sonnenuntergang")) - def test_numbers(self): - self.assertEqual( - normalize("dies ist eins zwei drei test", lang="de-de"), - "dies ist 1 2 3 test") - self.assertEqual( - normalize("es ist vier fünf sechs test", lang="de-de"), - "es ist 4 5 6 test") - self.assertEqual( - normalize("es ist sieben acht neun test", lang="de-de"), - "es ist 7 8 9 test") - self.assertEqual( - normalize("es ist sieben acht neun test", lang="de-de"), - "es ist 7 8 9 test") - self.assertEqual( - normalize("dies ist zehn elf zwölf test", lang="de-de"), - "dies ist 10 11 12 test") - self.assertEqual( - normalize("dies ist dreizehn vierzehn test", lang="de-de"), - "dies ist 13 14 test") - self.assertEqual( - normalize("dies ist fünfzehn sechzehn siebzehn", lang="de-de"), - "dies ist 15 16 17") - self.assertEqual( - normalize("dies ist achtzehn neunzehn zwanzig", lang="de-de"), - "dies ist 18 19 20") + self.assertEqual(extract_duration("neunzehn minuten nach acht"), + (timedelta(minutes=19), "nach 8")) + + self.assertEqual(extract_duration(("weck mich in 3 wochen," + " 497 tagen und" + " 391.6 sekunden"), lang="de-de"), + (timedelta(weeks=3, days=497, seconds=391.6), + "weck mich in, und")) + + self.assertEqual(extract_duration("weck mich in einer viertel stunde"), + (timedelta(hours=0.25), "weck mich in")) + + self.assertEqual(extract_duration(("der film ist eine stunde, fünfzehn" + " einhalb minuten lang")), + (timedelta(hours=1, minutes=15.5), + "der film ist, lang")) + + # wenn überhaupt wäre anstatt -sekunde -sekündig[e][ns] notwendig + self.assertEqual(extract_duration("10-sekunden", lang="de-de"), + (timedelta(seconds=10.0), "")) + + self.assertEqual(extract_duration("5-minuten", lang="de-de"), + (timedelta(minutes=5), "")) if __name__ == "__main__":