Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update natural language parser #48

Merged
merged 8 commits on
May 22, 2024
Merged
50 changes: 20 additions & 30 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,8 @@ Test coverage includes every example given in the spec table of features.

* Years exceeding four digits:

>>> parse_edtf('y-12000') # 12000 years BCE
LongYear: 'y-12000'
>>> parse_edtf('Y-12000') # 12000 years BCE
LongYear: 'Y-12000'

* Season:

Expand Down Expand Up @@ -167,8 +167,8 @@ Test coverage includes every example given in the spec table of features.

* Year requiring more than 4 digits - exponential form:

>>> parse_edtf('y-17e7')
ExponentialYear: 'y-17e7'
>>> parse_edtf('Y-17e7')
ExponentialYear: 'Y-17e7'

### Natural language representation

Expand Down Expand Up @@ -196,43 +196,33 @@ The parser can parse strings such as:
'c.1860' => '1860~' # the period after the 'c' is optional
'ca1860' => '1860~'
'approx 1860' => '1860~'

# masked precision
'1860s' => '186x' #186x has decade precision, 186u has year precision.
'1800s' => '18xx' # without uncertainty indicators, assume century

# masked precision + uncertainty
'ca. 1860s' => '186x~'
'circa 1840s' => '184x~'
'ca. 1860s?' => '186x?~'
'c1800s?' => '180x?~' # with uncertainty indicators, use the decade
'ca. 1860s' => '186X~'
'circa 1840s' => '184X~'
'ca. 1860s?' => '186X%' # uncertain and approximate combine into '%'
'c1800s?' => '180X%' # with uncertainty indicators, use the decade

# unspecified parts
'January 12' => 'XXXX-01-12'
'January' => 'XXXX-01'
'7/2008' => '2008-07'
'month in 1872' => '1872-XX'
'day in January 1872' => '1872-01-XX'
'day in 1872' => '1872-XX-XX'

#seasons
'Autumn 1872' => '1872-23'
'Fall 1872' => '1872-23'

# before/after
'earlier than 1928' => 'unknown/1928'
'later than 1928' => '1928/unknown'
'before January 1928' => 'unknown/1928-01'
'after about the 1920s' => '192x~/unknown'

# unspecified
'year in the 1860s' => '186u' #186x has decade precision, 186u has year precision.
('year in the 1800s', '18xu')
'month in 1872' => '1872-XX'
'day in January 1872' => '1872-01-XX'
'day in 1872' => '1872-XX-XX'
'earlier than 1928' => '/1928'
'later than 1928' => '1928/'
'before January 1928' => '/1928-01'
'after about the 1920s' => '192X~/'

#centuries
'1st century' => '00xx'
'10c' => '09xx'
'19th century?' => '18xx?'
'1st century' => '00XX'
'10c' => '09XX'
'19th century?' => '18XX?'

# just showing off now...
'a day in about Spring 1849?' => '1849-21-XX%'
Expand All @@ -243,8 +233,8 @@ The parser can parse strings such as:
'1851-1852; printed 1853-1854' => '1851/1852'
'1851-52' => '1851/1852'
'1856-ca. 1865' => '1856/1865~'
'1860s-1870s' => '186x/187x'
'1920s -early 1930s' => '192x/193x'
'1860s-1870s' => '186X/187X'
'1920s - early 1930s' => '192X/193X'
'1938, printed 1940s-1950s' => '1938'


Expand Down
39 changes: 20 additions & 19 deletions edtf/natlang/en.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0)
DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0)

SHORT_YEAR_RE = r"(-?)([\du])([\dxu])([\dxu])([\dxu])"
LONG_YEAR_RE = r"y(-?)([1-9]\d\d\d\d+)"
SHORT_YEAR_RE = r"(-?)([\dX])([\dX])([\dX])([\dX])"
LONG_YEAR_RE = r"Y(-?)([1-9]\d\d\d\d+)"
CENTURY_RE = r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?"
CE_RE = r"(\d{1,4}) (ad|ce|bc|bce)"

Expand All @@ -31,7 +31,7 @@ def text_to_edtf(text):
Generate EDTF string equivalent of a given natural language date string.
"""
if not text:
return
return None

t = text.lower()

Expand Down Expand Up @@ -101,10 +101,9 @@ def text_to_edtf(text):
is_after = is_after or re.findall(r"\blater\b", t)

if is_before:
result = f"unknown/{result}"
result = f"/{result}" # unknown is replaced with null for intervals
elif is_after:
result = f"{result}/unknown"

result = f"{result}/" # unknown is replaced with null for intervals
return result


Expand Down Expand Up @@ -155,7 +154,7 @@ def text_to_edtf_date(text):
# detect CE/BCE year form
is_ce = re.findall(CE_RE, t)
if is_century:
result = "%02dxx" % (int(is_century[0][0]) - 1,)
result = "%02dXX" % (int(is_century[0][0]) - 1,)
is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CENTURY_RE, t)
is_uncertain = is_uncertain or re.findall(CENTURY_RE + r"\?", t)

Expand Down Expand Up @@ -222,25 +221,25 @@ def text_to_edtf_date(text):
# approximate/uncertain markers to decide whether we treat it as
# a century or a decade.
if i == 2 and could_be_century and not (is_approximate or is_uncertain):
result += "x"
result += "X"
elif i == 3 and is_decade > 0:
if mentions_year:
result += "u" # year precision
result += "X" # previously year precision - now just X
else:
result += "x" # decade precision
result += "X" # previously decade precision - now just X
elif date1[i] == date2[i]:
# since both attempts at parsing produced the same result
# it must be parsed value, not a default
result += date1[i]
else:
# different values were produced, meaning that it's likely
# a default. Use 'unspecified'
result += "u"
# a default. Use 'X'
result += "X"

# strip off unknown chars from end of string - except the first 4

for i in reversed(xrange(len(result))):
if result[i] not in ("u", "x", "-"):
if result[i] not in ("X", "-"):
smallest_length = 4

if mentions_month:
Expand All @@ -264,14 +263,16 @@ def text_to_edtf_date(text):

# end dateutil post-parsing

if is_uncertain:
result += "?"

if is_approximate:
result += "~"
if is_uncertain and is_approximate:
result += "%"
else:
if is_uncertain:
result += "?"
if is_approximate:
result += "~"

# weed out bad parses
if result.startswith("uu-uu"):
if result.startswith("XX-XX"):
return None

return result
Loading