diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4645d13..7645ec9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -100,7 +100,7 @@ jobs: - name: Publish benchmark results uses: benchmark-action/github-action-benchmark@v1 - if: github.event_name == 'pull_request' && github.repository == 'ixc/python-edtf' + if: github.event_name == 'pull_request' && github.repository == 'rism-digital/python-edtf' with: tool: 'pytest' auto-push: true @@ -112,7 +112,7 @@ jobs: summary-always: true - name: Comment on benchmark results without publishing - if: github.event_name != 'pull_request' || github.repository != 'ixc/python-edtf' + if: github.event_name != 'pull_request' || github.repository != 'rism-digital/python-edtf' uses: benchmark-action/github-action-benchmark@v1 with: tool: 'pytest' diff --git a/.gitignore b/.gitignore index 36df893..d27f79d 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,5 @@ docs/_build/ # PyBuilder target/ +.idea +.DS_Store diff --git a/edtf/convert.py b/edtf/convert.py index ee03f36..db86155 100644 --- a/edtf/convert.py +++ b/edtf/convert.py @@ -70,8 +70,7 @@ def trim_struct_time(st: struct_time, strip_time: bool = False) -> struct_time: """ if strip_time: return struct_time(list(st[:3]) + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - else: - return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS) + return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS) def struct_time_to_jd(st: struct_time) -> float: @@ -116,7 +115,7 @@ def jd_to_struct_time(jd: float) -> struct_time: return struct_time([year, month, day, hour, minute, second] + TIME_EMPTY_EXTRAS) -def _roll_negative_time_fields(year, month, day, hour, minute, second): +def _roll_negative_time_fields(year, month, day, hour, minute, second) -> tuple: """ Fix date/time fields which have nonsense negative values for any field except for year by rolling the overall date/time value backwards, treating @@ -152,4 +151,5 @@ def _roll_negative_time_fields(year, month, 
day, hour, minute, second): year += int(month / 12.0) # Adjust by whole year in months year -= 1 # Subtract 1 for negative minutes month %= 12 # Convert negative month to positive remainder - return (year, month, day, hour, minute, second) + + return year, month, day, hour, minute, second diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index f28e685..97230db 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,7 +1,9 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" +import functools import re from datetime import datetime +from typing import Optional from dateutil.parser import ParserError, parse @@ -13,19 +15,46 @@ DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0) DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0) -SHORT_YEAR_RE = r"(-?)([\dX])([\dX])([\dX])([\dX])" -LONG_YEAR_RE = r"Y(-?)([1-9]\d\d\d\d+)" -CENTURY_RE = r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?" -CE_RE = r"(\d{1,4}) (ad|ce|bc|bce)" +SHORT_YEAR_RE = re.compile(r"(-?)([\du])([\dxu])([\dxu])([\dxu])") +LONG_YEAR_RE = re.compile(r"y(-?)([1-9]\d\d\d\d+)") +CENTURY_RE = re.compile(r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?") +CENTURY_RANGE = re.compile(r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]") +CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)") +ONE_DIGIT_PARTIAL_FIRST = re.compile(r"\d\D\b") +TWO_DIGIT_PARTIAL_FIRST = re.compile(r"\d\d\b") +PARTIAL_CHECK = re.compile(r"\b\d\d\d\d$") +SLASH_YEAR = re.compile(r"(\d\d\d\d)/(\d\d\d\d)") +BEFORE_CHECK = re.compile(r"\b(?:before|earlier|avant)\b") +AFTER_CHECK = re.compile(r"\b(after|since|later|après|aprés|apres)\b") +APPROX_CHECK = re.compile( + r"\b(?:ca?\.? 
?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|(?:^~)" +) +UNCERTAIN_CHECK = re.compile(r"\b(?:uncertain|possibly|maybe|guess|\d{3,4}\?)") +UNCERTAIN_REPL = re.compile(r"(\d{4})\?") +MIGHT_BE_CENTURY = re.compile(r"(\d{2}00)s") +MIGHT_BE_DECADE = re.compile(r"(\d{3}0)s") + +APPROX_CENTURY_RE = re.compile( + r"\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?" +) +UNCERTAIN_CENTURY_RE = re.compile( + r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?" +) + +APPROX_CE_RE = re.compile(r"\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)") +UNCERTAIN_CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)\?") + +MENTIONS_YEAR = re.compile(r"\byear\b.+(in|during)\b") +MENTIONS_MONTH = re.compile(r"\bmonth\b.+(in|during)\b") +MENTIONS_DAY = re.compile(r"\bday\b.+(in|during)\b") # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. -REJECT_RULES = ( - r".*dynasty.*", # Don't parse '23rd Dynasty' to 'uuuu-uu-23' -) +REJECT_RULES = re.compile(r".*dynasty.*") # Don't parse '23rd Dynasty' to 'uuuu-uu-23' -def text_to_edtf(text): +@functools.lru_cache +def text_to_edtf(text: str) -> Optional[str]: """ Generate EDTF string equivalent of a given natural language date string. """ @@ -35,7 +64,7 @@ def text_to_edtf(text): t = text.lower() # try parsing the whole thing - result = text_to_edtf_date(t) + result: Optional[str] = text_to_edtf_date(t) if not result: # split by list delims and move fwd with the first thing that returns a non-empty string. @@ -43,7 +72,8 @@ def text_to_edtf(text): for split in [",", ";", "or"]: for list_item in t.split(split): # try parsing as an interval - split by '-' - toks = list_item.split("-") + toks: list[str] = list_item.split("-") + if len(toks) == 2: d1 = toks[0].strip() d2 = toks[1].strip() @@ -51,19 +81,20 @@ def text_to_edtf(text): # match looks from the beginning of the string, search # looks anywhere. - if re.match(r"\d\D\b", d2): # 1-digit year partial e.g. 
1868-9 + if re.match( + ONE_DIGIT_PARTIAL_FIRST, d2 + ): # 1-digit year partial e.g. 1868-9 if re.search( - r"\b\d\d\d\d$", d1 + PARTIAL_CHECK, d1 ): # TODO: evaluate it and see if it's a year d2 = d1[-4:-1] + d2 - elif re.match(r"\d\d\b", d2): # 2-digit year partial e.g. 1809-10 - if re.search(r"\b\d\d\d\d$", d1): + elif re.match( + TWO_DIGIT_PARTIAL_FIRST, d2 + ): # 2-digit year partial e.g. 1809-10 + if re.search(PARTIAL_CHECK, d1): d2 = d1[-4:-2] + d2 else: - century_range_match = re.search( - r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]", - f"{d1}-{d2}", - ) + century_range_match = re.search(CENTURY_RANGE, f"{d1}-{d2}") if century_range_match: g = century_range_match.groups() d1 = f"{g[0]}C" @@ -82,7 +113,7 @@ def text_to_edtf(text): # This whole section could be more friendly. else: - int_match = re.search(r"(\d\d\d\d)\/(\d\d\d\d)", list_item) + int_match = re.search(SLASH_YEAR, list_item) if int_match: return f"[{int_match.group(1)}, {int_match.group(2)}]" @@ -92,21 +123,19 @@ def text_to_edtf(text): if result: break - is_before = re.findall(r"\bbefore\b", t) - is_before = is_before or re.findall(r"\bearlier\b", t) - - is_after = re.findall(r"\bafter\b", t) - is_after = is_after or re.findall(r"\bsince\b", t) - is_after = is_after or re.findall(r"\blater\b", t) + is_before = re.findall(BEFORE_CHECK, t) + is_after = re.findall(AFTER_CHECK, t) if is_before: - result = f"/{result}" # unknown is replaced with null for intervals + result = f"/{result}" elif is_after: - result = f"{result}/" # unknown is replaced with null for intervals + result = f"{result}/" + return result -def text_to_edtf_date(text): +@functools.lru_cache +def text_to_edtf_date(text: str) -> Optional[str]: """ Return EDTF string equivalent of a given natural language date string. @@ -115,37 +144,28 @@ def text_to_edtf_date(text): differ are undefined. 
""" if not text: - return + return None t = text.lower() - result = "" + result: str = "" - for reject_re in REJECT_RULES: - if re.match(reject_re, t): - return + if re.match(REJECT_RULES, t): + return None # matches on '1800s'. Needs to happen before is_decade. - could_be_century = re.findall(r"(\d{2}00)s", t) + could_be_century: list = re.findall(MIGHT_BE_CENTURY, t) # matches on '1800s' and '1910s'. Removes the 's'. # Needs to happen before is_uncertain because e.g. "1860s?" - t, is_decade = re.subn(r"(\d{3}0)s", r"\1", t) + t, is_decade = re.subn(MIGHT_BE_DECADE, r"\1", t) # detect approximation signifiers # a few 'circa' abbreviations just before the year - is_approximate = re.findall(r"\b(ca?\.?) ?\d{4}", t) + is_approximate = re.findall(APPROX_CHECK, t) # the word 'circa' anywhere - is_approximate = is_approximate or re.findall(r"\bcirca\b", t) - # the word 'approx'/'around'/'about' anywhere - is_approximate = is_approximate or re.findall(r"\b(approx|around|about)", t) - # a ~ before a year-ish number - is_approximate = is_approximate or re.findall(r"\b~\d{4}", t) - # a ~ at the beginning - is_approximate = is_approximate or re.findall(r"^~", t) # detect uncertainty signifiers - t, is_uncertain = re.subn(r"(\d{4})\?", r"\1", t) - # the words uncertain/maybe/guess anywhere - is_uncertain = is_uncertain or re.findall(r"\b(uncertain|possibly|maybe|guess)", t) + t, is_uncertain = re.subn(UNCERTAIN_REPL, r"\1", t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CHECK, t) # detect century forms is_century = re.findall(CENTURY_RE, t) @@ -154,31 +174,28 @@ def text_to_edtf_date(text): is_ce = re.findall(CE_RE, t) if is_century: result = "%02dXX" % (int(is_century[0][0]) - 1,) - is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" 
+ CENTURY_RE, t) - is_uncertain = is_uncertain or re.findall(CENTURY_RE + r"\?", t) + is_approximate = is_approximate or re.findall(APPROX_CENTURY_RE, t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CENTURY_RE, t) try: - is_bc = is_century[0][-1] in ("bc", "bce") - if is_bc: + if is_century[0][-1] in ("bc", "bce"): result = f"-{result}" except IndexError: pass elif is_ce: result = "%04d" % (int(is_ce[0][0])) - is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CE_RE, t) - is_uncertain = is_uncertain or re.findall(CE_RE + r"\?", t) + is_approximate = is_approximate or re.findall(APPROX_CE_RE, t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CE_RE, t) try: - is_bc = is_ce[0][-1] in ("bc", "bce") - if is_bc: + if is_ce[0][-1] in ("bc", "bce"): result = f"-{result}" except IndexError: pass else: # try dateutil.parse - try: # parse twice, using different defaults to see what was # parsed and what was guessed. @@ -205,15 +222,15 @@ def text_to_edtf_date(text): if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date(): # couldn't parse anything - defaults are untouched. - return + return None date1 = dt1.isoformat()[:10] date2 = dt2.isoformat()[:10] # guess precision of 'unspecified' characters to use - mentions_year = re.findall(r"\byear\b.+(in|during)\b", t) - mentions_month = re.findall(r"\bmonth\b.+(in|during)\b", t) - mentions_day = re.findall(r"\bday\b.+(in|during)\b", t) + mentions_year = re.findall(MENTIONS_YEAR, t) + mentions_month = re.findall(MENTIONS_MONTH, t) + mentions_day = re.findall(MENTIONS_DAY, t) for i in range(len(date1)): # if the given year could be a century (e.g. '1800s') then use @@ -221,18 +238,18 @@ def text_to_edtf_date(text): # a century or a decade. 
if i == 2 and could_be_century and not (is_approximate or is_uncertain): result += "X" - elif i == 3 and is_decade > 0: + elif i == 3 and is_decade: if mentions_year: - result += "X" # previously year precision - now just X + result += "X" # year precision else: - result += "X" # previously decade precision - now just X + result += "X" # decade precision elif date1[i] == date2[i]: # since both attempts at parsing produced the same result # it must be parsed value, not a default result += date1[i] else: # different values were produced, meaning that it's likely - # a default. Use 'X' + # a default. Use 'unspecified' result += "X" # strip off unknown chars from end of string - except the first 4 diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index ed03355..ad690fb 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -4,6 +4,7 @@ from datetime import date, datetime from operator import add, sub from time import struct_time +from typing import Optional from dateutil.relativedelta import relativedelta @@ -27,7 +28,7 @@ PRECISION_DAY = "day" -def days_in_month(year, month): +def days_in_month(year: int, month: int) -> int: """ Return the number of days in the given year and month, where month is 1=January to 12=December, and respecting leap years as identified by @@ -125,7 +126,7 @@ def __init__(self, *args, **kwargs): def __str__(self): raise NotImplementedError - def _strict_date(self, lean): + def _strict_date(self, lean: str): raise NotImplementedError def lower_strict(self): @@ -134,7 +135,7 @@ def lower_strict(self): def upper_strict(self): return self._strict_date(lean=LATEST) - def _get_fuzzy_padding(self, lean): + def _get_fuzzy_padding(self, lean: str): """ Subclasses should override this to pad based on how precise they are. 
""" @@ -276,6 +277,26 @@ def __init__( int(significant_digits) if significant_digits else None ) + def set_year(self, y: int): + if y is None: + raise AttributeError("Year must not be None") + self._year = y + + def get_year(self) -> int: + return self._year + + year = property(get_year, set_year) + + def set_month(self, m: Optional[int]): + self._month = m + if m is None: + self.day = None + + def get_month(self) -> Optional[int]: + return self._month + + month = property(get_month, set_month) + def __str__(self): r = self.year if self.month: @@ -749,7 +770,7 @@ def __str__(self): def _precise_year(self): return int(self.year) - def _strict_date(self, lean): + def _strict_date(self, lean: str): py = self._precise_year() if lean == EARLIEST: return struct_time([py, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) @@ -811,8 +832,8 @@ def _precise_month(self, lean): rng = appsettings.SEASON_L2_MONTHS_RANGE[int(self.season)] if lean == EARLIEST: return rng[0] - else: - return rng[1] + + return rng[1] # (* ************************** Level 2 *************************** *) @@ -909,12 +930,17 @@ def __str__(self): return result - def _precise_year(self, lean): + def set_year(self, y): # Year can be None. 
+ self._year = y + + year = property(Date.get_year, set_year) + + def _precise_year(self, lean: str): if self.season: return self.season._precise_year(lean) return super()._precise_year(lean) - def _precise_month(self, lean): + def _precise_month(self, lean: str): if self.season: return self.season._precise_month(lean) return super()._precise_month(lean) @@ -1018,14 +1044,14 @@ def __str__(self): class OneOfASet(EDTFObject): + def __init__(self, *args): + self.objects = args + @classmethod def parse_action(cls, toks): args = [t for t in toks.asList() if isinstance(t, EDTFObject)] return cls(*args) - def __init__(self, *args): - self.objects = args - def __str__(self): return "[{}]".format(", ".join([str(o) for o in self.objects])) @@ -1051,34 +1077,34 @@ def _strict_date(self, lean): class MultipleDates(EDTFObject): + def __init__(self, *args): + self.objects = args + @classmethod def parse_action(cls, toks): args = [t for t in toks.asList() if isinstance(t, EDTFObject)] return cls(*args) - def __init__(self, *args): - self.objects = args - def __str__(self): return "{{{}}}".format(", ".join([str(o) for o in self.objects])) def _strict_date(self, lean): if lean == LATEST: return max([x._strict_date(lean) for x in self.objects]) - else: - return min([x._strict_date(lean) for x in self.objects]) + return min([x._strict_date(lean) for x in self.objects]) class Level2Interval(Level1Interval): def __init__(self, lower, upper): # Check whether incoming lower/upper values are single-item lists, and # if so take just the first item. This works around what I *think* is a - # bug in the grammer that provides us with single-item lists of + # bug in the grammar that provides us with single-item lists of # `PartialUncertainOrApproximate` items for lower/upper values. 
if isinstance(lower, (tuple, list)) and len(lower) == 1: self.lower = lower[0] else: self.lower = lower + - if isinstance(lower, (tuple, list)) and len(upper) == 1: + if isinstance(upper, (tuple, list)) and len(upper) == 1: self.upper = upper[0] else: diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..c4b40b6 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,45 @@ +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. + +[[package]] +name = "pyparsing" +version = "3.1.2" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +optional = false +python-versions = ">=3.6.8" +files = [ + {file = "pyparsing-3.1.2-py3-none-any.whl", hash = "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"}, + {file = "pyparsing-3.1.2.tar.gz", hash = "sha256:a1bac0ce561155ecc3ed78ca94d3c9378656ad4c94c1270de543f621420f94ad"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "six" +version = "1.16.0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[metadata] +lock-version = "2.0" +python-versions = "^3.9" +content-hash = 
"e6be32f86f1a6af0695f6846b57ed289e015b5634c7f574c45800095a84e2200"