Skip to content

Commit

Permalink
get_plural_form_en/pt
Browse files Browse the repository at this point in the history
port MycroftAI#36 + MycroftAI#37

add pt pluralizations.json

add tests
  • Loading branch information
JarbasAl committed Jul 13, 2022
1 parent 74c60ec commit 36c4de1
Show file tree
Hide file tree
Showing 9 changed files with 270 additions and 33 deletions.
4 changes: 3 additions & 1 deletion lingua_franca/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,7 +636,7 @@ def get_plural_category(amount, type=PluralCategory.CARDINAL, lang=""):
raise FunctionNotLocalizedError("This function has not been implemented in the specified language.")


@localized_function()
@localized_function(run_own_code_on=[FunctionNotLocalizedError])
def get_plural_form(word, amount, type=PluralCategory.CARDINAL, lang=""):
"""
Get plural form of the specified word for the specified amount.
Expand All @@ -651,3 +651,5 @@ def get_plural_form(word, amount, type=PluralCategory.CARDINAL, lang=""):
Returns:
(str): Pluralized word.
"""
warn(RuntimeWarning("Pluralization has not been implemented in the specified language. Word unchanged"))
return word
57 changes: 57 additions & 0 deletions lingua_franca/lang/common_data_pt.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from lingua_franca.lang.parse_common import invert_dict


_FUNCTION_NOT_IMPLEMENTED_WARNING = "esta função não foi implementada em 'pt'"

# Undefined articles ["um", "uma", "uns", "umas"] cannot be suppressed,
Expand All @@ -20,6 +23,60 @@
_MALE_DETERMINANTS_PT = ["o", "os", "este", "estes", "esse", "esses"]
_FEMALE_DETERMINANTS_PT = ["a", "as", "estas", "estas", "essa", "essas"]


# constants used for singularize / pluralize
_VOWELS_PT = ["a", "ã", "á", "à",
"e", "é", "è",
"i", "ì", "í",
"o", "ó", "ò", "õ",
"u", "ú", "ù"]

_INVARIANTS_PT = ["ontem", "depressa", "ali", "além", "sob", "por", "contra", "desde", "entre",
"até", "perante", "porém", "contudo", "todavia", "entretanto", "senão", "portanto",
"oba", "eba", "exceto", "excepto", "apenas", "menos", "também", "inclusive", "aliás",
"que", "onde", "isto", "isso", "aquilo", "algo", "alguém", "nada", "ninguém", "tudo", "cada",
"outrem", "quem", "mais", "menos", "demais",
# NOTE some words ommited because it depends on POS_TAG
# NOTE these multi word expressions are also invariant
"ou melhor", "isto é", "por exemplo", "a saber", "digo", "ou seja",
"por assim dizer", "com efeito", "ou antes"]

# Irregular singular -> plural forms; _pluralize_pt consults this mapping
# before applying any of its suffix rules.
# NOTE(review): "cais" and "cal" both map to "cais". When this dict is passed
# through invert_dict only one singular can be kept for "cais" - confirm which
# one wins and whether that is the intended result.
_PLURAL_EXCEPTIONS_PT = {
    "cânon": "cânones",
    "cós": "coses",  # "cós" (unchanged word) is also valid
    "cais": "cais",
    "xis": "xis",
    "mal": "males",
    "cônsul": "cônsules",
    "mel": "méis",  # "meles" also valid
    "fel": "féis",  # "feles" also valid
    "cal": "cais",  # "cales" also valid
    "aval": "avais",  # "avales" also valid
    "mol": "móis",  # "moles" also valid
    "real": "réis",
    "fax": "faxes",
    "cálix": "cálices",
    "índex": "índices",
    "apêndix": "apêndices",
    "hélix": "hélices",
    "hálux": "háluces",
    "códex": "códices",
    "fénix": "fénixes",  # "fénix" also valid
    "til": "tis",  # "tiles" also valid
    "pão": "pães",
    "cão": "cães",
    "alemão": "alemães",
    "balão": "balões",
    "anão": "anões",
    "dez": "dez",
    "três": "três",
    "seis": "seis"
}

# in general words that end with "s" in singular form should be added below
# Built by passing the plural exceptions through invert_dict (presumably a
# plural -> singular mapping; confirm against parse_common.invert_dict).
_SINGULAR_EXCEPTIONS_PT = invert_dict(_PLURAL_EXCEPTIONS_PT)

# constants for number handling
_NUMBERS_PT = {
"zero": 0,
"um": 1,
Expand Down
20 changes: 19 additions & 1 deletion lingua_franca/lang/format_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

import inflection
from lingua_franca.lang.format_common import convert_to_mixed_fraction, PluralCategory, PluralAmount
from lingua_franca.lang.common_data_en import _NUM_STRING_EN, \
_FRACTION_STRING_EN, _LONG_SCALE_EN, _SHORT_SCALE_EN, _SHORT_ORDINAL_EN, _LONG_ORDINAL_EN
Expand Down Expand Up @@ -411,3 +411,21 @@ def get_plural_category_en(amount, type=PluralCategory.CARDINAL):

else:
return ValueError("Argument \"type\" must be cardinal|ordinal|range")


def get_plural_form_en(word, amount, type=PluralCategory.CARDINAL):
    """
    Return the English form of a word that agrees with the given amount.

    An amount equal to 1 selects the singular form; every other amount
    selects the plural form. The inflection itself is delegated to the
    ``inflection`` package.

    Args:
        word(str): Word to be inflected.
        amount(int or float or pair or list): The amount the word has to
            agree with.
        type(str): Either cardinal (default), ordinal or range. Accepted for
            signature compatibility; this implementation does not read it.
    Returns:
        (str): Singular form when amount == 1, plural form otherwise.
    """
    inflect = inflection.singularize if amount == 1 else inflection.pluralize
    return inflect(word)
84 changes: 82 additions & 2 deletions lingua_franca/lang/format_pt.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
# limitations under the License.
#

from lingua_franca.lang.format_common import convert_to_mixed_fraction
from lingua_franca.lang.format_common import convert_to_mixed_fraction, PluralCategory, PluralAmount
from lingua_franca.lang.common_data_pt import _FRACTION_STRING_PT, \
_NUM_STRING_PT
_NUM_STRING_PT, _VOWELS_PT, _PLURAL_EXCEPTIONS_PT, _SINGULAR_EXCEPTIONS_PT, _INVARIANTS_PT


def nice_number_pt(number, speech, denominators=range(1, 21)):
Expand Down Expand Up @@ -221,3 +221,83 @@ def nice_time_pt(dt, speech=True, use_24hour=False, use_ampm=False):
elif hour != 0 and hour != 12:
speak += " da noite"
return speak


def _singularize_pt(word):
    """
    Best-effort, rule-based singularization of a Portuguese word.

    The word is assumed to be in plural form; there is no reliable way to
    detect that here, so a singular word ending in "s" may be wrongly
    mutated. Such words should be listed in the exceptions dict instead.

    Args:
        word(str): Word (expected to be plural) to singularize.
    Returns:
        (str): Singular form, or the word unchanged when no rule applies.
    """
    if word in _INVARIANTS_PT:
        # invariant words have a single form (the list cannot be indexed by word)
        return word
    if word in _SINGULAR_EXCEPTIONS_PT:
        return _SINGULAR_EXCEPTIONS_PT[word]
    # TODO implement is_plural helper

    # Suffixes are removed by slicing: str.rstrip() strips a *set of
    # characters*, which mangles words such as "meses" (-> "m").
    if word.endswith(("ais", "eis", "ois", "uis")):
        # inverse of the pluralizer rule "al/el/ol/ul" -> "ais/eis/ois/uis"
        return word[:-2] + "l"
    if word.endswith("is"):
        # inverse of the pluralizer rule "il" -> "is"
        return word[:-2] + "il"
    if word.endswith(("ões", "ães")):
        # only the ending is rewritten, never an earlier occurrence
        return word[:-3] + "ão"
    if word.endswith("es"):
        return word[:-2]
    if word.endswith("ns"):
        # inverse of the pluralizer rule "m" -> "ns"
        return word[:-2] + "m"
    if word.endswith("s"):
        return word[:-1]
    return word


def _pluralize_pt(word):
    """
    Best-effort, rule-based pluralization of a Portuguese word.

    Invariant words and known irregular plurals are resolved first; after
    that the plural is derived from the word's ending.

    Args:
        word(str): Word (expected to be singular) to pluralize.
    Returns:
        (str): Plural form, or the word unchanged when no rule applies
            (including the empty string).
    """
    if not word:
        # nothing to inflect; also keeps word[-1] below from raising IndexError
        return word
    if word in _INVARIANTS_PT:
        return word
    if word in _PLURAL_EXCEPTIONS_PT:
        return _PLURAL_EXCEPTIONS_PT[word]
    if word.endswith(("x", "s")):
        # TODO - the "s" check catches too many words, need a better one:
        # an oxytone should get "es" appended, anything else stays unchanged
        # https://en.wikipedia.org/wiki/Oxytone
        return word
    if word.endswith("ão"):
        # can end with "ãos", "ães" or "ões"; most of the time all are valid,
        # otherwise the word is expected to be in the exceptions dict
        # TODO check if numeric, then it's always "ões"
        return word + "s"
    if word[-1] in _VOWELS_PT:
        # words ending with a vowel simply get an "s"
        return word + "s"
    if word.endswith(("r", "z", "n")):
        return word + "es"
    if word.endswith(("al", "el", "ol", "ul")):
        # drop the final "l" and append "is"
        return word[:-1] + "is"
    if word.endswith("il"):
        return word[:-1] + "s"
    if word.endswith("m"):
        return word[:-1] + "ns"
    # foreign words that have been "unportuguesified" have an "s" added;
    # a simple check is looking for endings that don't exist in Portuguese
    if word.endswith(("w", "y", "k", "t")):
        return word + "s"
    return word


def get_plural_form_pt(word, amount, type=PluralCategory.CARDINAL):
    """
    Return the Portuguese form of a word that agrees with the given amount.

    An amount equal to 1 selects the singular form; every other amount
    selects the plural form.

    Args:
        word(str): Word to be inflected.
        amount(int or float or pair or list): The amount the word has to
            agree with.
        type(str): Either cardinal (default), ordinal or range. Accepted for
            signature compatibility; this implementation does not read it.
    Returns:
        (str): Singular form when amount == 1, plural form otherwise.
    """
    inflect = _singularize_pt if amount == 1 else _pluralize_pt
    return inflect(word)
24 changes: 24 additions & 0 deletions lingua_franca/res/text/pt-pt/pluralizations.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"day": {
"one": "dia",
"other": "dias"
},
"hour": {
"one": "hora",
"other": "horas"
},
"minute": {
"one": "minuto",
"other": "minutos"
},
"second": {
"one": "segundo",
"other": "segundos"
},
"and": {
"one": "e"
},
"or": {
"one": "ou"
}
}
3 changes: 2 additions & 1 deletion requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
python-dateutil~=2.6
rapidfuzz
rapidfuzz
inflection
58 changes: 38 additions & 20 deletions test/unittests/test_format_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest
import datetime
import sys
import unittest

# TODO either write a getter for lingua_franca.internal._SUPPORTED_LANGUAGES,
# or make it public somehow
from lingua_franca import load_language, unload_language, set_default_lang, \
get_primary_lang_code, get_active_langs, get_supported_langs
from lingua_franca.internal import UnsupportedLanguageError
from lingua_franca.format import nice_number
from lingua_franca.format import nice_time
from lingua_franca.format import nice_date
from lingua_franca.format import nice_date_time
from lingua_franca import load_language, unload_language, set_default_lang
from lingua_franca.format import get_plural_category
from lingua_franca.format import join_list, get_plural_form
from lingua_franca.format import nice_duration
from lingua_franca.format import nice_number, get_plural_category
from lingua_franca.format import nice_number
from lingua_franca.format import nice_time
from lingua_franca.format import nice_year
from lingua_franca.format import nice_duration
from lingua_franca.format import pronounce_number
from lingua_franca.format import date_time_format
from lingua_franca.format import join_list
from lingua_franca.format import pronounce_lang
from lingua_franca.time import default_timezone, set_default_tz, now_local, \
to_local

from lingua_franca.format import pronounce_number
from lingua_franca.time import default_timezone


def setUpModule():
Expand Down Expand Up @@ -80,7 +71,6 @@ def tearDownModule():


class TestNiceNumberFormat(unittest.TestCase):

tmp_var = None

def set_tmp_var(self, val):
Expand Down Expand Up @@ -372,7 +362,7 @@ def test_ordinals(self):
class TestNiceDateFormat(unittest.TestCase):

def test_convert_times(self):
dt = datetime.datetime(2017, 1, 31,
dt = datetime.datetime(2017, 1, 31,
13, 22, 3, tzinfo=default_timezone())

# Verify defaults haven't changed
Expand Down Expand Up @@ -495,7 +485,6 @@ def test_convert_times(self):
self.assertEqual(nice_time(dt),
"quarter to two")


def test_nice_duration(self):
self.assertEqual(nice_duration(1), "one second")
self.assertEqual(nice_duration(3), "three seconds")
Expand Down Expand Up @@ -576,5 +565,34 @@ def test_range_numbers(self):
self.assertEqual(get_plural_category((0, 2), type="range"), "other")


class TestInflection(unittest.TestCase):
    """Checks get_plural_form against English singular and plural forms."""

    def test_singularize(self):
        """An amount of 1 must produce the singular form."""
        expectations = [
            ("posts", "post"),
            ("octopi", "octopus"),
            ("sheep", "sheep"),
            # input that is already singular
            ("word", "word"),
            # made-up compound word
            ("CamelOctopi", "CamelOctopus"),
        ]
        for given, wanted in expectations:
            self.assertEqual(get_plural_form(given, 1), wanted)

    def test_pluralize(self):
        """Any amount other than 1 must produce the plural form."""
        expectations = [
            ("post", 2, "posts"),
            ("octopus", 3, "octopi"),
            ("sheep", 4, "sheep"),
            # input that is already plural
            ("words", 5, "words"),
            # irregular nouns
            ("person", 6, "people"),
            ("man", 2, "men"),
            ("human", 3, "humans"),
            ('child', 4, 'children'),
            ('sex', 2, 'sexes'),
            ('move', 3, 'moves'),
            ('cow', 4, 'kine'),
            ('zombie', 5, 'zombies'),
            # made-up compound word
            ("CamelOctopus", 6, "CamelOctopi"),
        ]
        for given, amount, wanted in expectations:
            self.assertEqual(get_plural_form(given, amount), wanted)


if __name__ == "__main__":
unittest.main()
31 changes: 31 additions & 0 deletions test/unittests/test_format_pt.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from lingua_franca import load_language, unload_language, set_default_lang
from lingua_franca.format import nice_time
from lingua_franca.format import pronounce_number
from lingua_franca.format import get_plural_form
from lingua_franca.time import default_timezone


Expand Down Expand Up @@ -309,5 +310,35 @@ def test_minutes_past_hour(self):
"onze e um quarto da noite")


class TestInflection(unittest.TestCase):
    """Checks get_plural_form against Portuguese singular and plural forms."""

    def test_singularize(self):
        """An amount of 1 must produce the singular form."""
        expectations = [
            # NOTE(review): "homems" is not the standard plural of "homem"
            # ("homens") - confirm whether this input is intentional
            ("homems", "homem"),
            ("cavalos", "cavalo"),
            ("ovelhas", "ovelha"),
            # input that is already singular
            ("palavra", "palavra"),
            # made-up word
            ("gerubicios", "gerubicio"),
        ]
        for given, wanted in expectations:
            self.assertEqual(get_plural_form(given, 1), wanted)

    def test_pluralize(self):
        """Any amount other than 1 must produce the plural form."""
        expectations = [
            ("poste", 2, "postes"),
            ("polvo", 3, "polvos"),
            ("ovelha", 4, "ovelhas"),
            # input that is already plural
            ("palavras", 5, "palavras"),
            ("ovelhas", 3, "ovelhas"),
            # irregular / invariant words
            ("anão", 6, "anões"),
            ("alemão", 2, "alemães"),
            ("apêndix", 3, "apêndices"),
            ('três', 4, 'três'),
            ('seis', 2, 'seis'),
            ('ontem', 3, 'ontem'),
            ('depressa', 4, 'depressa'),
            ('contra', 5, 'contra'),
            # made-up word
            ("gerubicio", 6, "gerubicios"),
        ]
        for given, amount, wanted in expectations:
            self.assertEqual(get_plural_form(given, amount), wanted)


if __name__ == "__main__":
unittest.main()
Loading

0 comments on commit 36c4de1

Please sign in to comment.