get_plural_form_en/pt

port MycroftAI#36 + MycroftAI#37 add pt pluralizations.json add tests
OpenVoiceOS · Jul 13, 2022 · 36c4de1 · 36c4de1
1 parent 74c60ec
commit 36c4de1
Show file tree

Hide file tree

Showing 9 changed files with 270 additions and 33 deletions.
diff --git a/lingua_franca/format.py b/lingua_franca/format.py
@@ -636,7 +636,7 @@ def get_plural_category(amount, type=PluralCategory.CARDINAL, lang=""):
  raise FunctionNotLocalizedError("This function has not been implemented in the specified language.")
 
 
-@localized_function()
+@localized_function(run_own_code_on=[FunctionNotLocalizedError])
 def get_plural_form(word, amount, type=PluralCategory.CARDINAL, lang=""):
  """
  Get plural form of the specified word for the specified amount.
@@ -651,3 +651,5 @@ def get_plural_form(word, amount, type=PluralCategory.CARDINAL, lang=""):
  Returns:
  (str): Pluralized word.
  """
+ warn(RuntimeWarning("Pluralization has not been implemented in the specified language. Word unchanged"))
+ return word
diff --git a/lingua_franca/lang/common_data_pt.py b/lingua_franca/lang/common_data_pt.py
@@ -1,3 +1,6 @@
+from lingua_franca.lang.parse_common import invert_dict
+
+
 _FUNCTION_NOT_IMPLEMENTED_WARNING = "esta função não foi implementada em 'pt'"
 
 # Undefined articles ["um", "uma", "uns", "umas"] can not be supressed,
@@ -20,6 +23,60 @@
 _MALE_DETERMINANTS_PT = ["o", "os", "este", "estes", "esse", "esses"]
 _FEMALE_DETERMINANTS_PT = ["a", "as", "estas", "estas", "essa", "essas"]
 
+
+# constants used for singularize / pluralize
+# Vowels (plain and accented); a word whose last character is in this list
+# is pluralized by appending "s".
+# NOTE(review): circumflex vowels ("â", "ê", "ô") are not listed - confirm
+# whether words ending in them should also be treated as vowel-final.
+_VOWELS_PT = ["a", "ã", "á", "à",
+ "e", "é", "è",
+ "i", "ì", "í",
+ "o", "ó", "ò", "õ",
+ "u", "ú", "ù"]
+
+# Words and multi-word expressions that are returned unchanged by the
+# pluralization helpers. This is a list, so it only supports membership
+# tests, not lookups by word.
+# NOTE(review): "menos" appears twice below; harmless for membership tests.
+_INVARIANTS_PT = ["ontem", "depressa", "ali", "além", "sob", "por", "contra", "desde", "entre",
+ "até", "perante", "porém", "contudo", "todavia", "entretanto", "senão", "portanto",
+ "oba", "eba", "exceto", "excepto", "apenas", "menos", "também", "inclusive", "aliás",
+ "que", "onde", "isto", "isso", "aquilo", "algo", "alguém", "nada", "ninguém", "tudo", "cada",
+ "outrem", "quem", "mais", "menos", "demais",
+ # NOTE some words omitted because they depend on the POS tag
+ # NOTE these multi-word expressions are also invariant
+ "ou melhor", "isto é", "por exemplo", "a saber", "digo", "ou seja",
+ "por assim dizer", "com efeito", "ou antes"]
+
+# Singular -> plural overrides that are consulted before the generic
+# suffix rules. Inline comments list alternative plurals that are also valid.
+_PLURAL_EXCEPTIONS_PT = {
+ "cânon": "cânones",
+ "cós": "coses", # "cós" (unchanged word) is also valid
+ "cais": "cais",
+ "xis": "xis",
+ "mal": "males",
+ "cônsul": "cônsules",
+ "mel": "méis", # "meles" also valid
+ "fel": "féis", # "feles" also valid
+ "cal": "cais", # "cales" also valid
+ "aval": "avais", # "avales" also valid
+ "mol": "móis", # "moles" also valid
+ "real": "réis",
+ "fax": "faxes",
+ "cálix": "cálices",
+ "índex": "índices",
+ "apêndix": "apêndices",
+ "hélix": "hélices",
+ "hálux": "háluces",
+ "códex": "códices",
+ "fénix": "fénixes", # "fénix" also valid
+ "til": "tis", # "tiles" also valid
+ "pão": "pães",
+ "cão": "cães",
+ "alemão": "alemães",
+ "balão": "balões",
+ "anão": "anões",
+ "dez": "dez",
+ "três": "três",
+ "seis": "seis"
+}
+
+# in general, words that end with "s" in singular form should be added above
+# so that the inverted mapping below covers them
+# NOTE(review): "cais" and "cal" share the plural "cais", so the inverted
+# mapping can hold only one singular for it - verify which one invert_dict keeps.
+_SINGULAR_EXCEPTIONS_PT = invert_dict(_PLURAL_EXCEPTIONS_PT)
+
+# constants for number handling
 _NUMBERS_PT = {
  "zero": 0,
  "um": 1,

diff --git a/lingua_franca/lang/format_en.py b/lingua_franca/lang/format_en.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+import inflection
 from lingua_franca.lang.format_common import convert_to_mixed_fraction, PluralCategory, PluralAmount
 from lingua_franca.lang.common_data_en import _NUM_STRING_EN, \
  _FRACTION_STRING_EN, _LONG_SCALE_EN, _SHORT_SCALE_EN, _SHORT_ORDINAL_EN, _LONG_ORDINAL_EN
@@ -411,3 +411,21 @@ def get_plural_category_en(amount, type=PluralCategory.CARDINAL):
 
  else:
  return ValueError("Argument \"type\" must be cardinal|ordinal|range")
+
+
+def get_plural_form_en(word, amount, type=PluralCategory.CARDINAL):
+    """
+    Return the English form of a word that agrees with the given amount.
+
+    The word is singularized when the amount equals 1 and pluralized for
+    every other amount; both conversions are delegated to the
+    ``inflection`` package.
+
+    Args:
+        word(str): Word to be inflected.
+        amount(int or float or pair or list): The amount the word refers
+            to. Only its equality with 1 is checked.
+        type(str): Either cardinal (default), ordinal or range. Accepted
+            as part of the signature but not consulted by this function.
+    Returns:
+        (str): Singular form if amount is 1, otherwise the plural form.
+    """
+    inflect = inflection.singularize if amount == 1 else inflection.pluralize
+    return inflect(word)
diff --git a/lingua_franca/lang/format_pt.py b/lingua_franca/lang/format_pt.py
@@ -14,9 +14,9 @@
 # limitations under the License.
 #
 
-from lingua_franca.lang.format_common import convert_to_mixed_fraction
+from lingua_franca.lang.format_common import convert_to_mixed_fraction, PluralCategory, PluralAmount
 from lingua_franca.lang.common_data_pt import _FRACTION_STRING_PT, \
- _NUM_STRING_PT
+ _NUM_STRING_PT, _VOWELS_PT, _PLURAL_EXCEPTIONS_PT, _SINGULAR_EXCEPTIONS_PT, _INVARIANTS_PT
 
 
 def nice_number_pt(number, speech, denominators=range(1, 21)):
@@ -221,3 +221,83 @@ def nice_time_pt(dt, speech=True, use_24hour=False, use_ampm=False):
  elif hour != 0 and hour != 12:
  speak += " da noite"
  return speak
+
+
+def _singularize_pt(word):
+    """
+    Best-effort conversion of a Portuguese word to its singular form.
+
+    The word is assumed to be plural; nothing here can verify that, so a
+    word that is already singular but ends with "s" may be wrongly altered
+    unless it is listed in the exceptions dict.
+
+    Args:
+        word(str): Word to be singularized.
+    Returns:
+        (str): Singular form of the word, or the word itself when it is
+            invariant or no rule applies.
+    """
+    if word in _INVARIANTS_PT:
+        # _INVARIANTS_PT is a list of words that never change, so the word is
+        # returned as is (indexing the list with a string raises TypeError)
+        return word
+    if word in _SINGULAR_EXCEPTIONS_PT:
+        return _SINGULAR_EXCEPTIONS_PT[word]
+    # TODO implement is_plural helper
+    # Suffixes are removed by slicing: str.rstrip() strips a *set* of
+    # characters rather than a suffix, and str.replace() would also rewrite
+    # matches in the middle of the word.
+    # Order matters: "ões"/"ães" must be tested before the generic "es".
+    if word.endswith("is"):
+        return word[:-2] + "il"
+    if word.endswith(("ões", "ães")):
+        return word[:-3] + "ão"
+    if word.endswith("es"):
+        return word[:-2]
+    if word.endswith("s"):
+        return word[:-1]
+    return word
+
+
+def _pluralize_pt(word):
+    """
+    Best-effort conversion of a Portuguese word to its plural form.
+
+    Invariant words and the exceptions dict are checked first; otherwise
+    the plural is derived from the ending of the word.
+
+    Args:
+        word(str): Word to be pluralized.
+    Returns:
+        (str): Plural form of the word, or the word itself when it is
+            empty, invariant or no rule applies.
+    """
+    if not word:
+        # nothing to inflect; also keeps word[-1] below from raising IndexError
+        return word
+    if word in _INVARIANTS_PT:
+        return word
+    if word in _PLURAL_EXCEPTIONS_PT:
+        return _PLURAL_EXCEPTIONS_PT[word]
+    if word.endswith(("x", "s")):
+        # TODO - the "s" check catches too many words: an oxytone ending in
+        # "s" should take "es", only other words remain unchanged
+        # https://en.wikipedia.org/wiki/Oxytone
+        return word
+    if word.endswith("ão"):
+        # can end with "ãos", "ães" or "ões"; most times they are all valid,
+        # the other times the word is expected to be in the exceptions dict
+        # TODO check if numeric, then it's always "ões"
+        return word + "s"
+    if word[-1] in _VOWELS_PT:
+        # words ending with a vowel take an "s"
+        return word + "s"
+    if word.endswith(("r", "z", "n")):
+        return word + "es"
+    # the final consonant is dropped by slicing so exactly one character is
+    # removed (str.rstrip() would strip every repeated trailing character)
+    if word.endswith(("al", "el", "ol", "ul")):
+        return word[:-1] + "is"
+    if word.endswith("il"):
+        return word[:-1] + "s"
+    if word.endswith("m"):
+        return word[:-1] + "ns"
+    # foreign words that have been "unportuguesified" have an "s" added;
+    # simple check is looking for endings that don't exist in Portuguese
+    if word.endswith(("w", "y", "k", "t")):
+        return word + "s"
+    return word
+
+
+def get_plural_form_pt(word, amount, type=PluralCategory.CARDINAL):
+    """
+    Return the Portuguese form of a word that agrees with the given amount.
+
+    The word is singularized when the amount equals 1 and pluralized for
+    every other amount.
+
+    Args:
+        word(str): Word to be inflected.
+        amount(int or float or pair or list): The amount the word refers
+            to. Only its equality with 1 is checked.
+        type(str): Either cardinal (default), ordinal or range. Accepted
+            as part of the signature but not consulted by this function.
+    Returns:
+        (str): Singular form if amount is 1, otherwise the plural form.
+    """
+    inflect = _singularize_pt if amount == 1 else _pluralize_pt
+    return inflect(word)
diff --git a/lingua_franca/res/text/pt-pt/pluralizations.json b/lingua_franca/res/text/pt-pt/pluralizations.json
@@ -0,0 +1,24 @@
+{
+ "day": {
+ "one": "dia",
+ "other": "dias"
+ },
+ "hour": {
+ "one": "hora",
+ "other": "horas"
+ },
+ "minute": {
+ "one": "minuto",
+ "other": "minutos"
+ },
+ "second": {
+ "one": "segundo",
+ "other": "segundos"
+ },
+ "and": {
+ "one": "e"
+ },
+ "or": {
+ "one": "ou"
+ }
+}
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
@@ -1,2 +1,3 @@
 python-dateutil~=2.6
-rapidfuzz
+rapidfuzz
+inflection
diff --git a/test/unittests/test_format_en.py b/test/unittests/test_format_en.py
@@ -13,30 +13,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import unittest
 import datetime
 import sys
+import unittest
+
 # TODO either write a getter for lingua_franca.internal._SUPPORTED_LANGUAGES,
 # or make it public somehow
-from lingua_franca import load_language, unload_language, set_default_lang, \
- get_primary_lang_code, get_active_langs, get_supported_langs
-from lingua_franca.internal import UnsupportedLanguageError
-from lingua_franca.format import nice_number
-from lingua_franca.format import nice_time
-from lingua_franca.format import nice_date
-from lingua_franca.format import nice_date_time
+from lingua_franca import load_language, unload_language, set_default_lang
+from lingua_franca.format import get_plural_category
+from lingua_franca.format import join_list, get_plural_form
 from lingua_franca.format import nice_duration
-from lingua_franca.format import nice_number, get_plural_category
+from lingua_franca.format import nice_number
 from lingua_franca.format import nice_time
-from lingua_franca.format import nice_year
-from lingua_franca.format import nice_duration
-from lingua_franca.format import pronounce_number
-from lingua_franca.format import date_time_format
-from lingua_franca.format import join_list
 from lingua_franca.format import pronounce_lang
-from lingua_franca.time import default_timezone, set_default_tz, now_local, \
- to_local
-
+from lingua_franca.format import pronounce_number
+from lingua_franca.time import default_timezone
 
 
 def setUpModule():
@@ -80,7 +71,6 @@ def tearDownModule():
 
 
 class TestNiceNumberFormat(unittest.TestCase):
-
  tmp_var = None
 
  def set_tmp_var(self, val):
@@ -372,7 +362,7 @@ def test_ordinals(self):
 class TestNiceDateFormat(unittest.TestCase):
 
  def test_convert_times(self):
- dt = datetime.datetime(2017, 1, 31, 
+ dt = datetime.datetime(2017, 1, 31,
  13, 22, 3, tzinfo=default_timezone())
 
  # Verify defaults haven't changed
@@ -495,7 +485,6 @@ def test_convert_times(self):
  self.assertEqual(nice_time(dt),
  "quarter to two")
 
-
  def test_nice_duration(self):
  self.assertEqual(nice_duration(1), "one second")
  self.assertEqual(nice_duration(3), "three seconds")
@@ -576,5 +565,34 @@ def test_range_numbers(self):
  self.assertEqual(get_plural_category((0, 2), type="range"), "other")
 
 
+class TestInflection(unittest.TestCase):
+    """Checks get_plural_form() against known singular/plural pairs."""
+
+    def test_singularize(self):
+        """An amount of 1 must yield the singular form."""
+        expectations = [
+            ("posts", "post"),
+            ("octopi", "octopus"),
+            ("sheep", "sheep"),
+            # already singular
+            ("word", "word"),
+            # garbage
+            ("CamelOctopi", "CamelOctopus"),
+        ]
+        for given, singular in expectations:
+            self.assertEqual(get_plural_form(given, 1), singular)
+
+    def test_pluralize(self):
+        """Any amount other than 1 must yield the plural form."""
+        expectations = [
+            ("post", 2, "posts"),
+            ("octopus", 3, "octopi"),
+            ("sheep", 4, "sheep"),
+            # already plural
+            ("words", 5, "words"),
+            # irregular nouns
+            ("person", 6, "people"),
+            ("man", 2, "men"),
+            ("human", 3, "humans"),
+            ("child", 4, "children"),
+            ("sex", 2, "sexes"),
+            ("move", 3, "moves"),
+            ("cow", 4, "kine"),
+            ("zombie", 5, "zombies"),
+            # garbage
+            ("CamelOctopus", 6, "CamelOctopi"),
+        ]
+        for given, amount, plural in expectations:
+            self.assertEqual(get_plural_form(given, amount), plural)
+
+
 if __name__ == "__main__":
  unittest.main()
diff --git a/test/unittests/test_format_pt.py b/test/unittests/test_format_pt.py
@@ -20,6 +20,7 @@
 from lingua_franca import load_language, unload_language, set_default_lang
 from lingua_franca.format import nice_time
 from lingua_franca.format import pronounce_number
+from lingua_franca.format import get_plural_form
 from lingua_franca.time import default_timezone
 
 
@@ -309,5 +310,35 @@ def test_minutes_past_hour(self):
  "onze e um quarto da noite")
 
 
+class TestInflection(unittest.TestCase):
+    """Checks get_plural_form() against known singular/plural pairs."""
+
+    def test_singularize(self):
+        """An amount of 1 must yield the singular form."""
+        expectations = [
+            ("homems", "homem"),
+            ("cavalos", "cavalo"),
+            ("ovelhas", "ovelha"),
+            # already singular
+            ("palavra", "palavra"),
+            # garbage
+            ("gerubicios", "gerubicio"),
+        ]
+        for given, singular in expectations:
+            self.assertEqual(get_plural_form(given, 1), singular)
+
+    def test_pluralize(self):
+        """Any amount other than 1 must yield the plural form."""
+        expectations = [
+            ("poste", 2, "postes"),
+            ("polvo", 3, "polvos"),
+            ("ovelha", 4, "ovelhas"),
+            # already plural
+            ("palavras", 5, "palavras"),
+            ("ovelhas", 3, "ovelhas"),
+            # irregular and invariant words
+            ("anão", 6, "anões"),
+            ("alemão", 2, "alemães"),
+            ("apêndix", 3, "apêndices"),
+            ("três", 4, "três"),
+            ("seis", 2, "seis"),
+            ("ontem", 3, "ontem"),
+            ("depressa", 4, "depressa"),
+            ("contra", 5, "contra"),
+            # garbage
+            ("gerubicio", 6, "gerubicios"),
+        ]
+        for given, amount, plural in expectations:
+            self.assertEqual(get_plural_form(given, amount), plural)
+
+
 if __name__ == "__main__":
  unittest.main()