Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/decimal markers #43

Open
wants to merge 2 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions lingua_franca/lang/parse_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,18 @@ def normalize(self, utterance="", remove_articles=None):
return utterance


def normalize_decimals(text, decimal, lang=""):
    """
    Replace 'decimal' with a decimal period so Python can floatify numbers.

    Only a 'decimal' marker that sits between two runs of digits forming a
    standalone token (for example '3,5') is rewritten. Every other
    occurrence of the marker in the text is left untouched.

    Args:
        text (str): the string in which to normalize decimal markers
        decimal (str): the decimal marker used in 'text', e.g. ','
        lang (str): currently unused by this function
    Returns:
        (str): 'text' with matching decimal markers replaced by '.'
    """
    # Nothing to do for an empty marker or when it is already a period.
    if not decimal or decimal == '.':
        return text
    # Escape the marker so regex metacharacters ('*', '+', '|', ...) are
    # matched literally instead of breaking or altering the pattern.
    pattern = r"\b(\d+)" + re.escape(decimal) + r"(\d+)\b"
    # Substitute in place at the matched positions only; a plain
    # str.replace() of the matched substring would also rewrite spans the
    # word-boundary check rejected (e.g. the '1,2' inside 'a1,2').
    return re.sub(pattern, r"\1.\2", text)


def match_yes_or_no(text, lang):
resource_file = resolve_resource_file(f"text/{lang}/yesno.json")
if not resource_file:
Expand Down
19 changes: 16 additions & 3 deletions lingua_franca/lang/parse_cs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
_LONG_ORDINAL_CS, _LONG_SCALE_CS, _SHORT_SCALE_CS, _SHORT_ORDINAL_CS, \
_FRACTION_STRING_CS, _MONTHS_CONVERSION, _MONTHS_CZECH, _TIME_UNITS_CONVERSION, \
_ORDINAL_BASE_CS # _ARTICLES_CS

from lingua_franca.lang.parse_common import normalize_decimals
import re
import json
from lingua_franca import resolve_resource_file
Expand Down Expand Up @@ -579,7 +579,7 @@ def _initialize_number_data(short_scale):
return multiplies, string_num_ordinal_cs, string_num_scale_cs


def extract_number_cs(text, short_scale=True, ordinals=False):
def extract_number_cs(text, short_scale=True, ordinals=False, decimal='.'):
"""
This function extracts a number from a text string,
handles pronunciations in long scale and short scale
Expand All @@ -590,11 +590,17 @@ def extract_number_cs(text, short_scale=True, ordinals=False):
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.

"""
if decimal != '.':
text = normalize_decimals(text, decimal)
return _extract_number_with_text_cs(tokenize(text.lower()),
short_scale, ordinals).value

Expand Down Expand Up @@ -1560,7 +1566,7 @@ def isFractional_cs(input_str, short_scale=True):
return False


def extract_numbers_cs(text, short_scale=True, ordinals=False):
def extract_numbers_cs(text, short_scale=True, ordinals=False, decimal='.'):
"""
Takes in a string and extracts a list of numbers.

Expand All @@ -1571,9 +1577,16 @@ def extract_numbers_cs(text, short_scale=True, ordinals=False):
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
list: list of extracted numbers as floats
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.

"""
if decimal != '.':
text = normalize_decimals(text, decimal)
results = _extract_numbers_with_text_cs(tokenize(text),
short_scale, ordinals)
return [float(result.value) for result in results]
Expand Down
23 changes: 20 additions & 3 deletions lingua_franca/lang/parse_da.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,32 @@
from lingua_franca.lang.common_data_da import _DA_NUMBERS
from lingua_franca.lang.format_da import pronounce_number_da
from lingua_franca.time import now_local
from lingua_franca.lang.parse_common import normalize_decimals


def extract_number_da(text, short_scale=True, ordinals=False):
def extract_number_da(text, short_scale=True, ordinals=False, decimal='.'):
"""
This function prepares the given text for parsing by making
numbers consistent, getting rid of contractions, etc.
Args:
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
(int) or (float): The value of extracted number
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.


indefinite articles cannot be suppressed in Danish:
'en hest' means 'one horse' and 'a horse'

"""
if decimal != '.':
text = normalize_decimals(text, decimal)
# TODO: short_scale and ordinals don't do anything here.
# The parameters are present in the function signature for API compatibility
# reasons.
Expand Down Expand Up @@ -869,7 +879,7 @@ def normalize_da(text, remove_articles=True):
return normalized[1:] # strip the initial space


def extract_numbers_da(text, short_scale=True, ordinals=False):
def extract_numbers_da(text, short_scale=True, ordinals=False, decimal='.'):
"""
Takes in a string and extracts a list of numbers.

Expand All @@ -880,9 +890,16 @@ def extract_numbers_da(text, short_scale=True, ordinals=False):
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
list: list of extracted numbers as floats
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.

"""
if decimal != '.':
text = normalize_decimals(text, decimal)
return extract_numbers_generic(text, pronounce_number_da, extract_number_da,
short_scale=short_scale, ordinals=ordinals)

Expand Down
31 changes: 23 additions & 8 deletions lingua_franca/lang/parse_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from lingua_franca.lang.common_data_de import _DE_NUMBERS
from lingua_franca.lang.format_de import pronounce_number_de
from lingua_franca.time import now_local
from lingua_franca.lang.parse_common import normalize_decimals


de_numbers = {
Expand Down Expand Up @@ -143,20 +144,31 @@ def repl(match):
return (duration, text)


def extract_number_de(text, short_scale=True, ordinals=False):
def extract_number_de(text, short_scale=True, ordinals=False, decimal='.'):
"""
This function prepares the given text for parsing by making
numbers consistent, getting rid of contractions, etc.
This function extracts a number from a text string,
handles pronunciations in long scale and short scale

https://en.wikipedia.org/wiki/Names_of_large_numbers

Args:
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
(int) or (float): The value of extracted number

(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.

undefined articles cannot be suppressed in German:
'ein Pferd' means 'one horse' and 'a horse'
undefined articles cannot be suppressed in German:
'ein Pferd' means 'one horse' and 'a horse'

"""
if decimal != '.':
text = normalize_decimals(text, decimal)
# TODO: short_scale and ordinals don't do anything here.
# The parameters are present in the function signature for API compatibility
# reasons.
Expand Down Expand Up @@ -1003,7 +1015,7 @@ def normalize_de(text, remove_articles=True):
return normalized[1:] # strip the initial space


def extract_numbers_de(text, short_scale=True, ordinals=False):
def extract_numbers_de(text, short_scale=True, ordinals=False, decimal='.'):
"""
Takes in a string and extracts a list of numbers.

Expand All @@ -1014,9 +1026,12 @@ def extract_numbers_de(text, short_scale=True, ordinals=False):
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
list: list of extracted numbers as floats
"""
if decimal != '.':
text = normalize_decimals(text, decimal)
return extract_numbers_generic(text, pronounce_number_de, extract_number_de,
short_scale=short_scale, ordinals=ordinals)

Expand Down
17 changes: 15 additions & 2 deletions lingua_franca/lang/parse_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \
invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer
from lingua_franca.time import now_local
from lingua_franca.lang.parse_common import normalize_decimals


def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
Expand Down Expand Up @@ -529,7 +530,7 @@ def _initialize_number_data_en(short_scale, speech=True):
return multiplies, string_num_ordinal_en, string_num_scale_en


def extract_number_en(text, short_scale=True, ordinals=False):
def extract_number_en(text, short_scale=True, ordinals=False, decimal='.'):
"""
This function extracts a number from a text string,
handles pronunciations in long scale and short scale
Expand All @@ -540,11 +541,17 @@ def extract_number_en(text, short_scale=True, ordinals=False):
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.

"""
if decimal != '.':
text = normalize_decimals(text, decimal)
return _extract_number_with_text_en(tokenize(text.lower()),
short_scale, ordinals).value

Expand Down Expand Up @@ -1655,7 +1662,7 @@ def is_fractional_en(input_str, short_scale=True, spoken=True):
return False


def extract_numbers_en(text, short_scale=True, ordinals=False):
def extract_numbers_en(text, short_scale=True, ordinals=False, decimal='.'):
"""
Takes in a string and extracts a list of numbers.

Expand All @@ -1666,9 +1673,15 @@ def extract_numbers_en(text, short_scale=True, ordinals=False):
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
list: list of extracted numbers as floats
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.
"""
if decimal != '.':
text = normalize_decimals(text, decimal)
results = _extract_numbers_with_text_en(tokenize(text),
short_scale, ordinals)
return [float(result.value) for result in results]
Expand Down
30 changes: 25 additions & 5 deletions lingua_franca/lang/parse_es.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from lingua_franca.lang.format_es import pronounce_number_es
from lingua_franca.lang.parse_common import *
from lingua_franca.lang.common_data_es import _ARTICLES_ES, _STRING_NUM_ES
from lingua_franca.lang.parse_common import normalize_decimals


def is_fractional_es(input_str, short_scale=True):
Expand Down Expand Up @@ -56,16 +57,28 @@ def is_fractional_es(input_str, short_scale=True):
return False


def extract_number_es(text, short_scale=True, ordinals=False):
def extract_number_es(text, short_scale=True, ordinals=False, decimal='.'):
"""
This function prepares the given text for parsing by making
numbers consistent, getting rid of contractions, etc.
This function extracts a number from a text string,
handles pronunciations in long scale and short scale

https://en.wikipedia.org/wiki/Names_of_large_numbers

Args:
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
(int) or (float): The value of extracted number
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.
Comment on lines +60 to +77
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Review of extract_number_es function signature and decimal handling

The function now correctly includes a decimal parameter allowing for flexible decimal character specification. The implementation of normalize_decimals based on this parameter is a good approach to handle locale-specific decimal characters.

However, the note in the documentation about always extracting numbers formatted with a decimal dot/full stop might be confusing. It suggests that despite the decimal parameter, the function may not respect it under certain conditions. This could benefit from further clarification to ensure users understand the behavior fully.


"""
if decimal != '.':
text = normalize_decimals(text, decimal)
# TODO: short_scale and ordinals don't do anything here.
# The parameters are present in the function signature for API compatibility
# reasons.
Expand Down Expand Up @@ -268,7 +281,7 @@ def es_number(i):
return es_number(i)


def extract_numbers_es(text, short_scale=True, ordinals=False):
def extract_numbers_es(text, short_scale=True, ordinals=False, decimal='.'):
"""
Takes in a string and extracts a list of numbers.

Expand All @@ -279,9 +292,16 @@ def extract_numbers_es(text, short_scale=True, ordinals=False):
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
list: list of extracted numbers as floats
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.

"""
if decimal != '.':
text = normalize_decimals(text, decimal)
return extract_numbers_generic(text, pronounce_number_es,
extract_number_es, short_scale=short_scale,
ordinals=ordinals)
Expand Down
10 changes: 9 additions & 1 deletion lingua_franca/lang/parse_eu.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from lingua_franca.lang.format_eu import pronounce_number_eu
from lingua_franca.lang.parse_common import *
from lingua_franca.lang.common_data_eu import _NUM_STRING_EU
from lingua_franca.lang.parse_common import normalize_decimals


def isFractional_eu(input_str):
Expand Down Expand Up @@ -283,7 +284,7 @@ def eu_number(i):
return eu_number(i)


def extract_numbers_eu(text, short_scale=True, ordinals=False):
def extract_numbers_eu(text, short_scale=True, ordinals=False, decimal='.'):
"""
Takes in a string and extracts a list of numbers.

Expand All @@ -294,9 +295,16 @@ def extract_numbers_eu(text, short_scale=True, ordinals=False):
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
list: list of extracted numbers as floats
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.

"""
if decimal != '.':
text = normalize_decimals(text, decimal)
return extract_numbers_generic(text, pronounce_number_eu, extract_number_eu,
short_scale=short_scale, ordinals=ordinals)

Expand Down
Loading