Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

I have implemented parse_roman() function #64

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
61f57d7
Implemented parse_roman() function
AmPhIbIaN26 Apr 17, 2021
6664b6f
Moved test_numeral_roman.py
AmPhIbIaN26 Apr 17, 2021
376db40
Implemented roman_numeral() function
AmPhIbIaN26 Apr 18, 2021
ce4e5c6
Adding numeral_system as a parameter to functions
AmPhIbIaN26 Apr 24, 2021
6d420a7
added _valid_input_by_numeral_system()
AmPhIbIaN26 Apr 28, 2021
bf7d6e3
Added NUMERAL_SYSTEMS
AmPhIbIaN26 May 3, 2021
5104b76
fixed Unicode issue for Hindi, Spanish and Russian
AmPhIbIaN26 May 3, 2021
94d9441
Update test_numeral_roman.py
AmPhIbIaN26 May 3, 2021
475ad04
Update test_number_parsing.py
AmPhIbIaN26 May 3, 2021
f847cf6
Merge pull request #1 from AmPhIbIaN26/parse_roman(numeral-support)
AmPhIbIaN26 May 3, 2021
45bfdc5
Update README.rst
AmPhIbIaN26 May 4, 2021
c13102f
Merge pull request #2 from AmPhIbIaN26/parse_roman(numeral-support)
AmPhIbIaN26 May 4, 2021
973664b
Delete test.py
AmPhIbIaN26 May 4, 2021
0e89680
Made all the changes and added support for incorrect roman numbers
AmPhIbIaN26 May 6, 2021
9704eff
Removed .lower() from roman regex expressions
AmPhIbIaN26 May 6, 2021
554d553
Update test_numeral_systems.py
AmPhIbIaN26 May 6, 2021
77ae66f
Merge pull request #3 from AmPhIbIaN26/parse_roman(numeral-support)
AmPhIbIaN26 May 6, 2021
fb9bff7
Update number_parser/parser.py
AmPhIbIaN26 May 8, 2021
480cf7c
Update number_parser/parser.py
AmPhIbIaN26 May 8, 2021
faecf42
Merge pull request #4 from AmPhIbIaN26/parse_roman(regex-approach)
AmPhIbIaN26 May 8, 2021
cbc4661
Added more test cases for better code coverage
AmPhIbIaN26 May 12, 2021
086f5ec
Merge pull request #5 from AmPhIbIaN26/parse_roman(numeral-support)
AmPhIbIaN26 May 12, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion number_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from number_parser.parser import parse, parse_number, parse_ordinal, parse_fraction
from number_parser.parser import parse, parse_number, parse_ordinal, parse_fraction, NUMERAL_SYSTEMS
53 changes: 51 additions & 2 deletions number_parser/parser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import re
from importlib import import_module
import unicodedata

# Punctuation characters named as sentence separators.
# NOTE(review): the code consuming this list is outside the visible hunk - confirm usage.
SENTENCE_SEPARATORS = [".", ","]
# Language codes listed as supported (presumably English, Spanish, Hindi, Russian - confirm).
SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru']
# NOTE(review): presumably languages that need a regex workaround; the reason is not visible here - confirm.
RE_BUG_LANGUAGES = ['hi']
# Numeral system names; parse() compares the entries of its numeral_systems argument against these two strings.
NUMERAL_SYSTEMS = ['decimal', 'roman']


class LanguageData:
Expand Down Expand Up @@ -249,6 +251,9 @@ def parse_number(input_string, language=None):
if input_string.strip().isnumeric():
return int(input_string)

if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", input_string.lower()):
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
return int(_parse_roman(input_string))

if language is None:
language = _valid_tokens_by_language(input_string)

Expand Down Expand Up @@ -298,14 +303,33 @@ def parse_fraction(input_string, language=None):
return None


def parse(input_string, language=None):
def parse(input_string, language=None, numeral_systems=None):
    """
    Converts all the numbers in a sentence written in natural language to their numeric type while keeping
    the other words unchanged. Returns the transformed string.

    :param input_string: sentence to transform.
    :param language: language code handed to the decimal parser; when None it is
        detected from input_string with _valid_tokens_by_language().
    :param numeral_systems: iterable of numeral system names ('decimal', 'roman')
        applied in the given order, each one working on the output of the
        previous one. Defaults to NUMERAL_SYSTEMS. Names other than 'decimal'
        and 'roman' are ignored.
    :return: the transformed sentence, or input_string unchanged when no
        numeral system was applied.
    """
    if numeral_systems is None:
        numeral_systems = NUMERAL_SYSTEMS

    if language is None:
        language = _valid_tokens_by_language(input_string)

    # Keep the running result in a local initialised to the input: a
    # module-level global would leak the previous call's result (or raise
    # NameError) whenever numeral_systems selects no known system.
    complete_sentence = input_string

    for numeral_system in numeral_systems:
        if numeral_system == 'decimal':
            complete_sentence = _parse_decimal(complete_sentence, language)
        elif numeral_system == 'roman':
            complete_sentence = _parse_roman(complete_sentence)

    return complete_sentence


def _parse_decimal(input_string, language):
lang_data = LanguageData(language)

tokens = _tokenize(input_string, language)
Expand Down Expand Up @@ -359,8 +383,33 @@ def _build_and_add_number(pop_last_space=False):

_build_and_add_number()
current_sentence.append(token)

_build_and_add_number()

final_sentence.extend(current_sentence)
return ''.join(final_sentence).strip()


def _parse_roman(input_string):
    """
    Replaces the roman numeral tokens of a string with their decimal value.

    The string is split with _tokenize(); every token that matches the roman
    numeral pattern (compared in lower case) and evaluates to a non-zero number
    is replaced by the decimal digits of that number. The tokens are then
    concatenated back together.

    :param input_string: text to transform.
    :return: the re-joined string with roman numerals converted.
    """
    tokens = _tokenize(input_string, None)
    for position, token in enumerate(tokens):
        if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", token.lower()):
            # Evaluate once and reuse the result for both the check and the replacement.
            value = _build_roman(token)
            # Every group of the pattern is optional, so an empty token also
            # matches; it evaluates to 0 and must be left untouched.
            if value != 0:
                tokens[position] = str(value)

    return ''.join(tokens)


def _build_roman(roman_number):
roman = {'i': 1, 'v': 5, 'x': 10, 'l': 50, 'c': 100, 'd': 500, 'm': 1000}
num_tokens = re.split("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", roman_number.lower())
num_tokens = [item for item in num_tokens if item != '']
built_num = 0
for num_token in num_tokens:
if re.search('iv|ix|xl|xc|cd|cm', num_token):
built_num += roman[num_token[1]] - roman[num_token[0]]
elif re.search('[vld][ixc]{1,3}', num_token):
built_num += roman[num_token[0]] + (roman[num_token[1]] * (len(num_token) - 1))
else:
built_num += roman[num_token[0]] * len(num_token)
return built_num
3 changes: 3 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from number_parser import parse, parse_number

# Manual smoke check: convert the roman numeral inside a sentence.
# parse() is the entry point that takes numeral_systems; parse_number()
# has no such keyword and would raise TypeError.
print(parse('built in CDXX', numeral_systems=['roman']))
2 changes: 1 addition & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def get_test_files(path, prefix):
def _test_files(path, language, is_ordinal=True):
fnx = parse_ordinal if is_ordinal else parse_number
for filename in get_test_files(path, f'{language}_'):
with open(filename, "r") as csv_file:
with open(filename, "r", encoding="utf8") as csv_file:
csv_reader = csv.DictReader(csv_file)
for row in csv_reader:
try:
Expand Down
Loading