support decimal markers (#20)

rebase of MycroftAI#69 Co-authored-by: jarbasal <[email protected]>
HelloChatterbox · May 9, 2021 · cfbbd19 · cfbbd19
1 parent 79fe4a5
commit cfbbd19
Show file tree

Hide file tree

Showing 15 changed files with 350 additions and 171 deletions.
diff --git a/lingua_nostra/lang/parse_cs.py b/lingua_nostra/lang/parse_cs.py
@@ -23,7 +23,7 @@
  _LONG_ORDINAL_CS, _LONG_SCALE_CS, _SHORT_SCALE_CS, _SHORT_ORDINAL_CS, \
  _FRACTION_STRING_CS, _MONTHS_CONVERSION, _MONTHS_CZECH, _TIME_UNITS_CONVERSION, \
  _ORDINAL_BASE_CS # _ARTICLES_CS
-
+from lingua_nostra.parse import normalize_decimals
 import re
 import json
 from lingua_nostra import resolve_resource_file
@@ -579,7 +579,7 @@ def _initialize_number_data(short_scale):
  return multiplies, string_num_ordinal_cs, string_num_scale_cs
 
 
-def extract_number_cs(text, short_scale=True, ordinals=False):
+def extract_number_cs(text, short_scale=True, ordinals=False, decimal='.'):
  """
  This function extracts a number from a text string,
  handles pronunciations in long scale and short scale
@@ -590,11 +590,17 @@ def extract_number_cs(text, short_scale=True, ordinals=False):
  text (str): the string to normalize
  short_scale (bool): use short scale if True, long scale if False
  ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
+ decimal (str): character to use as decimal point. defaults to '.'
  Returns:
  (int) or (float) or False: The extracted number or False if no number
  was found
+ Note:
+ will always extract numbers formatted with a decimal dot/full stop,
+ such as '3.5', even if 'decimal' is specified.
 
  """
+ if decimal != '.':
+ text = normalize_decimals(text, decimal)
  return _extract_number_with_text_cs(tokenize(text.lower()),
  short_scale, ordinals).value
 
@@ -1560,20 +1566,25 @@ def isFractional_cs(input_str, short_scale=True):
  return False
 
 
-def extract_numbers_cs(text, short_scale=True, ordinals=False):
+def extract_numbers_cs(text, short_scale=True, ordinals=False, decimal='.'):
  """
  Takes in a string and extracts a list of numbers.
 
  Args:
- text (str): the string to extract a number from
- short_scale (bool): Use "short scale" or "long scale" for large
- numbers -- over a million. The default is short scale, which
- is now common in most English speaking countries.
- See https://en.wikipedia.org/wiki/Names_of_large_numbers
- ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
+ text (str): the string to extract numbers from
+ short_scale (bool): use short scale if True, long scale if False
+ ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
+ decimal (str): character to use as decimal point. defaults to '.'
  Returns:
- list: list of extracted numbers as floats
+ list: list of extracted numbers as floats, one entry per
+ extracted result
+ Note:
+ will always extract numbers formatted with a decimal dot/full stop,
+ such as '3.5', even if 'decimal' is specified.
+
  """
+ if decimal != '.':
+ text = normalize_decimals(text, decimal)
  results = _extract_numbers_with_text_cs(tokenize(text),
  short_scale, ordinals)
  return [float(result.value) for result in results]

diff --git a/lingua_nostra/lang/parse_da.py b/lingua_nostra/lang/parse_da.py
@@ -20,22 +20,31 @@
 from lingua_nostra.lang.common_data_da import _DA_NUMBERS
 from lingua_nostra.lang.format_da import pronounce_number_da
 from lingua_nostra.time import now_local
+from lingua_nostra.parse import normalize_decimals
 
 
-def extract_number_da(text, short_scale=True, ordinals=False):
+def extract_number_da(text, short_scale=True, ordinals=False, decimal='.'):
  """
- This function prepares the given text for parsing by making
- numbers consistent, getting rid of contractions, etc.
+ This function extracts a number from a text string,
+ handles pronunciations in long scale and short scale
+
+ https://en.wikipedia.org/wiki/Names_of_large_numbers
+
  Args:
  text (str): the string to normalize
+ short_scale (bool): currently ignored; kept for API compatibility
+ ordinals (bool): currently ignored; kept for API compatibility
+ decimal (str): character to use as decimal point. defaults to '.'
  Returns:
- (int) or (float): The value of extracted number
-
-
- undefined articles cannot be suppressed in German:
- 'ein Pferd' means 'one horse' and 'a horse'
+ (int) or (float) or False: The extracted number or False if no number
+ was found
+ Note:
+  will always extract numbers formatted with a decimal dot/full stop,
+  such as '3.5', even if 'decimal' is specified.
 
  """
+ if decimal != '.':
+ text = normalize_decimals(text, decimal)
  # TODO: short_scale and ordinals don't do anything here.
  # The parameters are present in the function signature for API compatibility
  # reasons.
@@ -869,20 +878,25 @@ def normalize_da(text, remove_articles=True):
  return normalized[1:] # strip the initial space
 
 
-def extract_numbers_da(text, short_scale=True, ordinals=False):
+def extract_numbers_da(text, short_scale=True, ordinals=False, decimal='.'):
  """
  Takes in a string and extracts a list of numbers.
 
- Args:
- text (str): the string to extract a number from
- short_scale (bool): Use "short scale" or "long scale" for large
- numbers -- over a million. The default is short scale, which
- is now common in most English speaking countries.
- See https://en.wikipedia.org/wiki/Names_of_large_numbers
- ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
+ Args:
+ text (str): the string to extract numbers from
+ short_scale (bool): use short scale if True, long scale if False
+ ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
+ decimal (str): character to use as decimal point. defaults to '.'
  Returns:
- list: list of extracted numbers as floats
+ list: list of extracted numbers as floats, as returned by
+ extract_numbers_generic
+ Note:
+ will always extract numbers formatted with a decimal dot/full stop,
+ such as '3.5', even if 'decimal' is specified.
+
  """
+ if decimal != '.':
+ text = normalize_decimals(text, decimal)
  return extract_numbers_generic(text, pronounce_number_da, extract_number_da,
  short_scale=short_scale, ordinals=ordinals)
 

diff --git a/lingua_nostra/lang/parse_de.py b/lingua_nostra/lang/parse_de.py
@@ -21,6 +21,7 @@
 from lingua_nostra.lang.common_data_de import _DE_NUMBERS
 from lingua_nostra.lang.format_de import pronounce_number_de
 from lingua_nostra.time import now_local
+from lingua_nostra.parse import normalize_decimals
 
 
 de_numbers = {
@@ -143,20 +144,28 @@ def repl(match):
  return (duration, text)
 
 
-def extract_number_de(text, short_scale=True, ordinals=False):
+def extract_number_de(text, short_scale=True, ordinals=False, decimal='.'):
  """
- This function prepares the given text for parsing by making
- numbers consistent, getting rid of contractions, etc.
+ This function extracts a number from a text string,
+ handles pronunciations in long scale and short scale
+
+ https://en.wikipedia.org/wiki/Names_of_large_numbers
+
  Args:
  text (str): the string to normalize
+ short_scale (bool): currently ignored; kept for API compatibility
+ ordinals (bool): currently ignored; kept for API compatibility
+ decimal (str): character to use as decimal point. defaults to '.'
  Returns:
- (int) or (float): The value of extracted number
-
-
- undefined articles cannot be suppressed in German:
- 'ein Pferd' means 'one horse' and 'a horse'
+ (int) or (float) or False: The extracted number or False if no number
+ was found
+ Note:
+  will always extract numbers formatted with a decimal dot/full stop,
+  such as '3.5', even if 'decimal' is specified.
 
  """
+ if decimal != '.':
+ text = normalize_decimals(text, decimal)
  # TODO: short_scale and ordinals don't do anything here.
  # The parameters are present in the function signature for API compatibility
  # reasons.
@@ -1003,20 +1012,28 @@ def normalize_de(text, remove_articles=True):
  return normalized[1:] # strip the initial space
 
 
-def extract_numbers_de(text, short_scale=True, ordinals=False):
- """
- Takes in a string and extracts a list of numbers.
-
- Args:
- text (str): the string to extract a number from
- short_scale (bool): Use "short scale" or "long scale" for large
- numbers -- over a million. The default is short scale, which
- is now common in most English speaking countries.
- See https://en.wikipedia.org/wiki/Names_of_large_numbers
- ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
- Returns:
- list: list of extracted numbers as floats
+def extract_numbers_de(text, short_scale=True, ordinals=False, decimal='.'):
  """
+ Takes in a string and extracts a list of numbers,
+ handles pronunciations in long scale and short scale
+
+ https://en.wikipedia.org/wiki/Names_of_large_numbers
+
+ Args:
+ text (str): the string to extract numbers from
+ short_scale (bool): use short scale if True, long scale if False
+ ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
+ decimal (str): character to use as decimal point. defaults to '.'
+ Returns:
+ list: list of extracted numbers as floats, as returned by
+ extract_numbers_generic
+ Note:
+ will always extract numbers formatted with a decimal dot/full stop,
+ such as '3.5', even if 'decimal' is specified.
+
+ """
+ if decimal != '.':
+ text = normalize_decimals(text, decimal)
  return extract_numbers_generic(text, pronounce_number_de, extract_number_de,
  short_scale=short_scale, ordinals=ordinals)
 

diff --git a/lingua_nostra/lang/parse_en.py b/lingua_nostra/lang/parse_en.py
@@ -30,6 +30,7 @@
 import re
 import json
 from lingua_nostra.internal import resolve_resource_file
+from lingua_nostra.parse import normalize_decimals
 
 
 def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
@@ -530,7 +531,7 @@ def _initialize_number_data_en(short_scale, speech=True):
  return multiplies, string_num_ordinal_en, string_num_scale_en
 
 
-def extract_number_en(text, short_scale=True, ordinals=False):
+def extract_number_en(text, short_scale=True, ordinals=False, decimal='.'):
  """
  This function extracts a number from a text string,
  handles pronunciations in long scale and short scale
@@ -541,11 +542,17 @@ def extract_number_en(text, short_scale=True, ordinals=False):
  text (str): the string to normalize
  short_scale (bool): use short scale if True, long scale if False
  ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
+ decimal (str): character to use as decimal point. defaults to '.'
  Returns:
  (int) or (float) or False: The extracted number or False if no number
  was found
+ Note:
+ will always extract numbers formatted with a decimal dot/full stop,
+ such as '3.5', even if 'decimal' is specified.
 
  """
+ if decimal != '.':
+ text = normalize_decimals(text, decimal)
  return _extract_number_with_text_en(tokenize(text.lower()),
  short_scale, ordinals).value
 
@@ -1453,7 +1460,7 @@ def is_fractional_en(input_str, short_scale=True, spoken=True):
  return False
 
 
-def extract_numbers_en(text, short_scale=True, ordinals=False):
+def extract_numbers_en(text, short_scale=True, ordinals=False, decimal='.'):
  """
  Takes in a string and extracts a list of numbers.
 
@@ -1464,9 +1471,15 @@ def extract_numbers_en(text, short_scale=True, ordinals=False):
  is now common in most English speaking countries.
  See https://en.wikipedia.org/wiki/Names_of_large_numbers
  ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
+ decimal (str): character to use as decimal point. defaults to '.'
  Returns:
  list: list of extracted numbers as floats
+ Note:
+ will always extract numbers formatted with a decimal dot/full stop,
+ such as '3.5', even if 'decimal' is specified.
  """
+ if decimal != '.':
+ text = normalize_decimals(text, decimal)
  results = _extract_numbers_with_text_en(tokenize(text),
  short_scale, ordinals)
  return [float(result.value) for result in results]

diff --git a/lingua_nostra/lang/parse_es.py b/lingua_nostra/lang/parse_es.py
@@ -20,6 +20,7 @@
 from lingua_nostra.lang.format_es import pronounce_number_es
 from lingua_nostra.lang.parse_common import *
 from lingua_nostra.lang.common_data_es import _ARTICLES_ES, _STRING_NUM_ES
+from lingua_nostra.parse import normalize_decimals
 
 
 def is_fractional_es(input_str, short_scale=True):
@@ -56,16 +57,28 @@ def is_fractional_es(input_str, short_scale=True):
  return False
 
 
-def extract_number_es(text, short_scale=True, ordinals=False):
+def extract_number_es(text, short_scale=True, ordinals=False, decimal='.'):
  """
- This function prepares the given text for parsing by making
- numbers consistent, getting rid of contractions, etc.
+ This function extracts a number from a text string,
+ handles pronunciations in long scale and short scale
+
+ https://en.wikipedia.org/wiki/Names_of_large_numbers
+
  Args:
  text (str): the string to normalize
+ short_scale (bool): currently ignored; kept for API compatibility
+ ordinals (bool): currently ignored; kept for API compatibility
+ decimal (str): character to use as decimal point. defaults to '.'
  Returns:
- (int) or (float): The value of extracted number
+ (int) or (float) or False: The extracted number or False if no number
+ was found
+ Note:
+ will always extract numbers formatted with a decimal dot/full stop,
+ such as '3.5', even if 'decimal' is specified.
 
  """
+ if decimal != '.':
+ text = normalize_decimals(text, decimal)
  # TODO: short_scale and ordinals don't do anything here.
  # The parameters are present in the function signature for API compatibility
  # reasons.
@@ -268,20 +281,25 @@ def es_number(i):
  return es_number(i)
 
 
-def extract_numbers_es(text, short_scale=True, ordinals=False):
+def extract_numbers_es(text, short_scale=True, ordinals=False, decimal='.'):
  """
  Takes in a string and extracts a list of numbers.
 
- Args:
- text (str): the string to extract a number from
- short_scale (bool): Use "short scale" or "long scale" for large
- numbers -- over a million. The default is short scale, which
- is now common in most English speaking countries.
- See https://en.wikipedia.org/wiki/Names_of_large_numbers
- ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
+ Args:
+ text (str): the string to extract numbers from
+ short_scale (bool): use short scale if True, long scale if False
+ ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
+ decimal (str): character to use as decimal point. defaults to '.'
  Returns:
- list: list of extracted numbers as floats
+ list: list of extracted numbers as floats, as returned by
+ extract_numbers_generic
+ Note:
+ will always extract numbers formatted with a decimal dot/full stop,
+ such as '3.5', even if 'decimal' is specified.
+
  """
+ if decimal != '.':
+ text = normalize_decimals(text, decimal)
  return extract_numbers_generic(text, pronounce_number_es,
  extract_number_es, short_scale=short_scale,
  ordinals=ordinals)