Skip to content

Commit

Permalink
german parse overhaul (#54)
Browse files Browse the repository at this point in the history
  • Loading branch information
emphasize authored May 19, 2023
1 parent 0c4ead9 commit 37436d4
Show file tree
Hide file tree
Showing 5 changed files with 1,065 additions and 528 deletions.
227 changes: 161 additions & 66 deletions lingua_franca/lang/common_data_de.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,12 @@
_DE_NUMBERS = {
'null': 0,
'ein': 1,
'eins': 1,
'eine': 1,
'einer': 1,
'einem': 1,
'einen': 1,
'eines': 1,
'zwei': 2,
'drei': 3,
'vier': 4,
'fünf': 5,
'sechs': 6,
'sieben': 7,
'acht': 8,
'neun': 9,
'zehn': 10,
'elf': 11,
'zwölf': 12,
'dreizehn': 13,
'vierzehn': 14,
'fünfzehn': 15,
'sechzehn': 16,
'siebzehn': 17,
'achtzehn': 18,
'neunzehn': 19,
'zwanzig': 20,
'einundzwanzig': 21,
'zweiundzwanzig': 22,
'dreiundzwanzig': 23,
'vierundzwanzig': 24,
'fünfundzwanzig': 25,
'sechsundzwanzig': 26,
'siebenundzwanzig': 27,
'achtundzwanzig': 28,
'neunundzwanzig': 29,
'dreißig': 30,
'einunddreißig': 31,
'vierzig': 40,
'fünfzig': 50,
'sechzig': 60,
'siebzig': 70,
'achtzig': 80,
'neunzig': 90,
'hundert': 100,
'zweihundert': 200,
'dreihundert': 300,
'vierhundert': 400,
'fünfhundert': 500,
'sechshundert': 600,
'siebenhundert': 700,
'achthundert': 800,
'neunhundert': 900,
'tausend': 1000,
'million': 1000000
}
from collections import OrderedDict
from lingua_franca.lang.parse_common import invert_dict

_MONTHS_DE = ['januar', 'februar', 'märz', 'april', 'mai', 'juni',
'juli', 'august', 'september', 'oktober', 'november',
'dezember']
_ARTICLES = {'der', 'das', 'die', 'dem', 'den'}

_NUM_STRING_DE = {
#_SPOKEN_NUMBER
_NUM_STRING = {
0: 'null',
1: 'ein', # ein Viertel etc., nicht eins Viertel
1: 'eins',
2: 'zwei',
3: 'drei',
4: 'vier',
Expand All @@ -89,20 +33,44 @@
70: 'siebzig',
80: 'achtzig',
90: 'neunzig',
100: 'hundert'
100: 'hundert',
200: 'zweihundert',
300: 'dreihundert',
400: 'vierhundert',
500: 'fünfhundert',
600: 'sechshundert',
700: 'siebenhundert',
800: 'achthundert',
900: 'neunhundert',
1000: 'tausend',
1000000: 'million'
}

# Word -> value lookup, obtained by passing _NUM_STRING through invert_dict.
_STRING_NUM = invert_dict(_NUM_STRING)

# Every inflected form of the indefinite article / numeral "ein" also maps
# to 1, so declined input is recognised as a number word.
_STRING_NUM.update(dict.fromkeys(
    ('ein', 'eine', 'einer', 'eines', 'einem', 'einen'), 1))

_MONTHS = ['januar', 'februar', 'märz', 'april', 'mai', 'juni',
'juli', 'august', 'september', 'oktober', 'november',
'dezember']

# German uses "long scale" https://en.wikipedia.org/wiki/Long_and_short_scales
# Currently, numbers are limited to 1000000000000000000000000,
# but _NUM_POWERS_OF_TEN can be extended to include additional number words


_NUM_POWERS_OF_TEN_DE = [
_NUM_POWERS_OF_TEN = [
'', 'tausend', 'Million', 'Milliarde', 'Billion', 'Billiarde', 'Trillion',
'Trilliarde'
]

_FRACTION_STRING_DE = {
_FRACTION_STRING = {
2: 'halb',
3: 'drittel',
4: 'viertel',
Expand All @@ -124,6 +92,16 @@
20: 'zwanzigstel'
}

# Word -> denominator lookup, obtained by passing _FRACTION_STRING through
# invert_dict.
_STRING_FRACTION = invert_dict(_FRACTION_STRING)

# All declined forms of "halb" map to the denominator 2.
_STRING_FRACTION.update(dict.fromkeys(
    ('halb', 'halbe', 'halben', 'halbes', 'halber', 'halbem'), 2))

# Numbers below 1 million are written as one word in German, yielding very
# long words.
# In some circumstances it may be better to separate the individual words.
Expand All @@ -132,4 +110,121 @@
# Set _EXTRA_SPACE = "" for correct spelling; this is the standard.

# _EXTRA_SPACE = " "
_EXTRA_SPACE_DE = ""
_EXTRA_SPACE = ""

# Ordinal stems keyed by their written numeral form ("1." -> "erst").
# The values carry no inflectional ending; _STRING_LONG_ORDINAL (built further
# down from _LONG_ORDINAL, which absorbs this table) appends "e", "en", "em",
# "er" and "es" to obtain the spoken forms such as "erste" or "drittem".
# NOTE(review): the table is not contiguous -- "1." through "53." are all
# listed, followed only by "60.", "70.", "80.", "90.", "100.", "1000." and
# "1000000."; confirm that callers compose the missing ordinals themselves.
_ORDINAL_BASE = {
"1.": "erst",
"2.": "zweit",
"3.": "dritt",
"4.": "viert",
"5.": "fünft",
"6.": "sechst",
"7.": "siebt",
"8.": "acht",
"9.": "neunt",
"10.": "zehnt",
"11.": "elft",
"12.": "zwölft",
"13.": "dreizehnt",
"14.": "vierzehnt",
"15.": "fünfzehnt",
"16.": "sechzehnt",
"17.": "siebzehnt",
"18.": "achtzehnt",
"19.": "neunzehnt",
"20.": "zwanzigst",
"21.": "einundzwanzigst",
"22.": "zweiundzwanzigst",
"23.": "dreiundzwanzigst",
"24.": "vierundzwanzigst",
"25.": "fünfundzwanzigst",
"26.": "sechsundzwanzigst",
"27.": "siebenundzwanzigst",
"28.": "achtundzwanzigst",
"29.": "neunundzwanzigst",
"30.": "dreißigst",
"31.": "einunddreißigst",
"32.": "zweiunddreißigst",
"33.": "dreiunddreißigst",
"34.": "vierunddreißigst",
"35.": "fünfunddreißigst",
"36.": "sechsunddreißigst",
"37.": "siebenunddreißigst",
"38.": "achtunddreißigst",
"39.": "neununddreißigst",
"40.": "vierzigst",
"41.": "einundvierzigst",
"42.": "zweiundvierzigst",
"43.": "dreiundvierzigst",
"44.": "vierundvierzigst",
"45.": "fünfundvierzigst",
"46.": "sechsundvierzigst",
"47.": "siebenundvierzigst",
"48.": "achtundvierzigst",
"49.": "neunundvierzigst",
"50.": "fünfzigst",
"51.": "einundfünfzigst",
"52.": "zweiundfünfzigst",
"53.": "dreiundfünfzigst",
"60.": "sechzigst",
"70.": "siebzigst",
"80.": "achtzigst",
"90.": "neunzigst",
"100.": "einhundertst",
"1000.": "eintausendst",
"1000000.": "millionst"
}

# Long-scale multiplier words (singular, lower case) keyed by their numeric
# value, listed in ascending order.
# Note that the keys up to 1000000 are ints while 1e9 and above are float
# literals, so consumers see a mix of int and float keys.
_LONG_SCALE = OrderedDict([
(100, 'hundert'),
(1000, 'tausend'),
(1000000, 'million'),
(1e9, "milliarde"),
(1e12, 'billion'),
(1e15, "billiarde"),
(1e18, "trillion"),
(1e21, "trilliarde"),
(1e24, "quadrillion"),
(1e27, "quadrilliarde")
])

_MULTIPLIER = set(_LONG_SCALE.values())

_STRING_LONG_SCALE = invert_dict(_LONG_SCALE)

# Register the inflected form of every multiplier word above 1000 so that
# e.g. "millionen" and "milliarden" resolve to the same value as the
# uninflected word. Entries up to 1000 are left untouched.
for number, item in _LONG_SCALE.items():
    if int(number) <= 1000:
        continue
    # Words already ending in "e" only take "n"; all others take "en".
    name = item + ('n' if item.endswith('e') else 'en')
    _MULTIPLIER.add(name)
    _STRING_LONG_SCALE[name] = number

# Ordinal stems of the long-scale multipliers, keyed by their value
# (float literals). As in _ORDINAL_BASE, the stems carry no ending.
_LONG_ORDINAL = {
1e6: "millionst",
1e9: "milliardst",
1e12: "billionst",
1e15: "billiardst",
1e18: "trillionst",
1e21: "trilliardst",
1e24: "quadrillionst",
1e27: "quadrilliardst"
}

# Merge in the numeral-keyed stems from _ORDINAL_BASE.
# NOTE(review): after this call the dict mixes float keys (1e6) with string
# keys ("1."), and the stem "millionst" is stored under both 1e6 and
# "1000000." -- confirm which of the two the inverted lookup is meant to
# return.
_LONG_ORDINAL.update(_ORDINAL_BASE)

# Lookup for inflected ordinal words such as "erste", "drittem" or
# "millionstes": each stem from _LONG_ORDINAL is combined with every
# adjective ending and mapped to whatever invert_dict yields for that stem
# (a float or an "N." string, since _LONG_ORDINAL mixes both key types).
_STRING_LONG_ORDINAL = {
    stem + ending: value
    for stem, value in invert_dict(_LONG_ORDINAL).items()
    for ending in ("en", "em", "es", "er", "e")
}

_FRACTION_MARKER = set()

_NEGATIVES = {"minus"}

_NUMBER_CONNECTORS = {"und"}

_COMMA = {"komma", "comma", "punkt"}
45 changes: 27 additions & 18 deletions lingua_franca/lang/format_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,17 @@
# limitations under the License.
#

from lingua_franca.lang.format_common import convert_to_mixed_fraction
from lingua_franca.lang.common_data_de import _EXTRA_SPACE_DE, \
_FRACTION_STRING_DE, _MONTHS_DE, _NUM_POWERS_OF_TEN_DE, _NUM_STRING_DE
from math import floor

from lingua_franca.lang.format_common import convert_to_mixed_fraction
from lingua_franca.lang.common_data_de import (
_EXTRA_SPACE,
_FRACTION_STRING,
_MONTHS,
_NUM_POWERS_OF_TEN,
_NUM_STRING
)


def nice_number_de(number, speech=True, denominators=range(1, 21)):
""" German helper for nice_number
Expand All @@ -44,7 +50,7 @@ def nice_number_de(number, speech=True, denominators=range(1, 21)):
return '{} {}/{}'.format(whole, num, den)
if num == 0:
return str(whole)
den_str = _FRACTION_STRING_DE[den]
den_str = _FRACTION_STRING[den]
if whole == 0:
if num == 1:
return_string = 'ein {}'.format(den_str)
Expand Down Expand Up @@ -85,24 +91,27 @@ def pronounce_triplet_de(num):
if num > 99:
hundreds = floor(num / 100)
if hundreds > 0:
result += _NUM_STRING_DE[
hundreds] + _EXTRA_SPACE_DE + 'hundert' + _EXTRA_SPACE_DE
number = _NUM_STRING[hundreds] if hundreds > 1 else "ein"
result += number + 'hundert' + _EXTRA_SPACE
num -= hundreds * 100
if num == 0:
result += '' # do nothing
elif num == 1:
result += 'eins' # need the s for the last digit
elif num <= 20:
result += _NUM_STRING_DE[num] # + _EXTRA_SPACE_DA
result += _NUM_STRING[num] # + _EXTRA_SPACE_DA
elif num > 20:
ones = num % 10
tens = num - ones
if ones > 0:
result += _NUM_STRING_DE[ones] + _EXTRA_SPACE_DE
number = _NUM_STRING[ones]
if ones == 1 and tens > 0: # eins > ein
number = number[:-1]
result += number + _EXTRA_SPACE
if tens > 0:
result += 'und' + _EXTRA_SPACE_DE
result += 'und' + _EXTRA_SPACE
if tens > 0:
result += _NUM_STRING_DE[tens] + _EXTRA_SPACE_DE
result += _NUM_STRING[tens] + _EXTRA_SPACE
return result

def pronounce_fractional_de(num,
Expand All @@ -112,7 +121,7 @@ def pronounce_fractional_de(num,
place = 10
while places > 0: # doesn't work with 1.0001 and places = 2: int(
# number*place) % 10 > 0 and places > 0:
result += " " + _NUM_STRING_DE[int(num * place) % 10]
result += " " + _NUM_STRING[int(num * place) % 10]
if int(num * place) % 10 == 1:
result += 's' # "1" is pronounced "eins" after the decimal
# point
Expand All @@ -135,18 +144,18 @@ def pronounce_whole_number_de(num, scale_level=0):
else:
result += "eins"
elif scale_level == 1:
result += 'ein' + _EXTRA_SPACE_DE + 'tausend' + _EXTRA_SPACE_DE
result += 'ein' + _EXTRA_SPACE + 'tausend' + _EXTRA_SPACE
else:
result += "eine " + _NUM_POWERS_OF_TEN_DE[scale_level] + ' '
result += "eine " + _NUM_POWERS_OF_TEN[scale_level] + ' '
elif last_triplet > 1:
result += pronounce_triplet_de(last_triplet)
if scale_level == 1:
# result += _EXTRA_SPACE_DA
result += 'tausend' + _EXTRA_SPACE_DE
result += 'tausend' + _EXTRA_SPACE
if scale_level >= 2:
# if _EXTRA_SPACE_DA == '':
# result += " "
result += " " + _NUM_POWERS_OF_TEN_DE[scale_level]
result += " " + _NUM_POWERS_OF_TEN[scale_level]
if scale_level >= 2:
if scale_level % 2 == 0:
result += "e" # MillionE
Expand All @@ -161,7 +170,7 @@ def pronounce_whole_number_de(num, scale_level=0):
if abs(number) >= 1000000000000000000000000: # cannot do more than this
return str(number)
elif number == 0:
return str(_NUM_STRING_DE[0])
return str(_NUM_STRING[0])
elif number < 0:
return "minus " + pronounce_number_de(abs(number), places)
else:
Expand Down Expand Up @@ -278,7 +287,7 @@ def nice_response_de(text):
words = text.split()

for idx, word in enumerate(words):
if word.lower() in _MONTHS_DE:
if word.lower() in _MONTHS:
text = _nice_ordinal_de(text)

if word == '^':
Expand All @@ -300,7 +309,7 @@ def _nice_ordinal_de(text, speech=True):
wordPrev = words[idx - 1] if idx > 0 else ""
if word[-1:] == ".":
if word[:-1].isdecimal():
if wordNext.lower() in _MONTHS_DE:
if wordNext.lower() in _MONTHS:
word = pronounce_ordinal_de(int(word[:-1]))
if wordPrev.lower() in ["am", "dem", "vom", "zum",
"(vom", "(am", "zum"]:
Expand Down
Loading

0 comments on commit 37436d4

Please sign in to comment.