diff --git a/htmldate/extractors.py b/htmldate/extractors.py index 2721ed72..564a0cfd 100644 --- a/htmldate/extractors.py +++ b/htmldate/extractors.py @@ -101,11 +101,15 @@ # .//*[(self::div or self::section)][@id="footer" or @class="footer"] # regex cache -YMD_NO_SEP_PATTERN = re.compile(r"(?:\D|^)(\d{8})(?:\D|$)") -YMD_PATTERN = re.compile(r"(?:\D|^)(\d{4})[\-/.](\d{1,2})[\-/.](\d{1,2})(?:\D|$)") -DMY_PATTERN = re.compile(r"(?:\D|^)(\d{1,2})[\-/.](\d{1,2})[\-/.](\d{2,4})(?:\D|$)") -YM_PATTERN = re.compile(r"(?:\D|^)(\d{4})[\-/.](\d{1,2})(?:\D|$)") -MY_PATTERN = re.compile(r"(?:\D|^)(\d{1,2})[\-/.](\d{4})(?:\D|$)") +YMD_NO_SEP_PATTERN = re.compile(r"\b(\d{8})\b") +YMD_PATTERN = re.compile( + r"(?:\D|^)(?P<year>\d{4})[\-/.](?P<month>\d{1,2})[\-/.](?P<day>\d{1,2})(?:\D|$)|" + r"(?:\D|^)(?P<day2>\d{1,2})[\-/.](?P<month2>\d{1,2})[\-/.](?P<year2>\d{2,4})(?:\D|$)" +) +YM_PATTERN = re.compile( + r"(?:\D|^)(?P<year>\d{4})[\-/.](?P<month>\d{1,2})(?:\D|$)|" + r"(?:\D|^)(?P<month2>\d{1,2})[\-/.](?P<year2>\d{4})(?:\D|$)" +) REGEX_MONTHS = """ January|February|March|April|May|June|July|August|September|October|November|December| @@ -116,16 +120,10 @@ Ocak|Şubat|Mart|Nisan|Mayıs|Haziran|Temmuz|Ağustos|Eylül|Ekim|Kasım|Aralık| Oca|Şub|Mar|Nis|Haz|Tem|Ağu|Eyl|Eki|Kas|Ara """ # todo: check "août" -LONG_MDY_PATTERN = re.compile( - rf"""({REGEX_MONTHS})\s -([0-9]{{1,2}})(?:st|nd|rd|th)?,? ([0-9]{{4}})""".replace( - "\n", "" - ), - re.I, -) -LONG_DMY_PATTERN = re.compile( - rf"""([0-9]{{1,2}})(?:st|nd|rd|th|\.)? (?:of )? -({REGEX_MONTHS}),? ([0-9]{{4}})""".replace( +LONG_TEXT_PATTERN = re.compile( + rf"""(?P<month>{REGEX_MONTHS})\s +(?P<day>[0-9]{{1,2}})(?:st|nd|rd|th)?,? (?P<year>[0-9]{{4}})|(?P<day2>[0-9]{{1,2}})(?:st|nd|rd|th|\.)? (?:of )? +(?P<month2>{REGEX_MONTHS}),? (?P<year2>[0-9]{{4}})""".replace( "\n", "" ), re.I, @@ -242,7 +240,7 @@ # \d[,.]\d+ # currency amounts # \b\d{5}\s # postal codes -# use of regex module for speed +# use of regex module for speed? EN_PATTERNS = re.compile( r'(?:date[^0-9"]{,20}|updated|published) *?(?:in)? *?:? 
*?([0-9]{1,4})[./]([0-9]{1,2})[./]([0-9]{2,4})', re.I, @@ -251,10 +249,8 @@ r"(?:Datum|Stand): ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})", re.I ) TR_PATTERNS = re.compile( - r"""(?:güncellen?me|yayı(?:m|n)lan?ma) *?(?:tarihi)? *?:? *?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})| -([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4}) *?(?:'de|'da|'te|'ta|’de|’da|’te|’ta|tarihinde) *(?:güncellendi|yayı(?:m|n)landı)""".replace( - "\n", "" - ), + r"(?:güncellen?me|yayı(?:m|n)lan?ma) *?(?:tarihi)? *?:? *?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})|" + r"([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4}) *?(?:'de|'da|'te|'ta|’de|’da|’te|’ta|tarihinde) *(?:güncellendi|yayı(?:m|n)landı)", re.I, ) @@ -359,23 +355,27 @@ def regex_parse(string: str) -> Optional[datetime]: """Try full-text parse for date elements using a series of regular expressions with particular emphasis on English, French, German and Turkish""" # https://github.com/vi3k6i5/flashtext ? - # multilingual day-month-year pattern - match = LONG_DMY_PATTERN.search(string) - if match: - day, month, year = match[1], TEXT_MONTHS[match[2].lower()], match[3] - else: - # American English - match = LONG_MDY_PATTERN.search(string) - if match: - day, month, year = match[2], TEXT_MONTHS[match[1].lower()], match[3] - else: - return None + # multilingual day-month-year + American English patterns + match = LONG_TEXT_PATTERN.search(string) + if not match: + return None # process and return try: - int_day, int_month, int_year = int(day), int(month), int(year) - int_year = correct_year(int_year) - int_day, int_month = try_swap_values(int_day, int_month) - dateobject = datetime(int_year, int_month, int_day) + if match.lastgroup == "year": + day, month, year = ( + int(match.group("day")), + int(TEXT_MONTHS[match.group("month").lower()]), + int(match.group("year")), + ) + else: + day, month, year = ( + int(match.group("day2")), + int(TEXT_MONTHS[match.group("month2").lower()]), + int(match.group("year2")), + ) + year = correct_year(year) + 
day, month = try_swap_values(day, month) + dateobject = datetime(year, month, day) except ValueError: return None else: @@ -436,34 +436,29 @@ def custom_parse( LOGGER.debug("YYYYMMDD match: %s", candidate) return candidate.strftime(outputformat) - # 3. Try YMD and Y-M-D pattern since it's the one used in ISO-8601 + # 3. Try the very common YMD, Y-M-D, and D-M-Y patterns match = YMD_PATTERN.search(string) if match: try: - day, month, year = int(match[3]), int(match[2]), int(match[1]) - candidate = datetime(year, month, day) - except ValueError: - LOGGER.debug("Y-M-D value error: %s", match[0]) - else: - if ( - date_validator( - candidate, "%Y-%m-%d", earliest=min_date, latest=max_date + # YMD + if match.lastgroup == "day": + candidate = datetime( + int(match.group("year")), + int(match.group("month")), + int(match.group("day")), ) - is True - ): - LOGGER.debug("Y-M-D match: %s", candidate) - return candidate.strftime(outputformat) - - # 4. Try the D-M-Y pattern since it's the most common date format in the world - match = DMY_PATTERN.search(string) - if match: - try: - day, month, year = int(match[1]), int(match[2]), int(match[3]) - year = correct_year(year) - day, month = try_swap_values(day, month) - candidate = datetime(year, month, day) + # DMY + else: + day, month, year = ( + int(match.group("day2")), + int(match.group("month2")), + int(match.group("year2")), + ) + year = correct_year(year) + day, month = try_swap_values(day, month) + candidate = datetime(year, month, day) except ValueError: - LOGGER.debug("D-M-Y value error: %s", match[0]) + LOGGER.debug("regex value error: %s", match[0]) else: if ( date_validator( @@ -471,15 +466,21 @@ def custom_parse( ) is True ): - LOGGER.debug("D-M-Y match: %s", candidate) + LOGGER.debug("regex match: %s", candidate) return candidate.strftime(outputformat) - # 5. Try the Y-M pattern + # 4. 
Try the Y-M and M-Y patterns match = YM_PATTERN.search(string) if match: try: - year, month = int(match[1]), int(match[2]) - candidate = datetime(year, month, 1) + if match.lastgroup == "month": + candidate = datetime( + int(match.group("year")), int(match.group("month")), 1 + ) + else: + candidate = datetime( + int(match.group("year2")), int(match.group("month2")), 1 + ) except ValueError: LOGGER.debug("Y-M value error: %s", match[0]) else: @@ -492,7 +493,7 @@ def custom_parse( LOGGER.debug("Y-M match: %s", candidate) return candidate.strftime(outputformat) - # 6. Try the other regex pattern + # 5. Try the other regex pattern dateobject = regex_parse(string) if ( date_validator(dateobject, outputformat, earliest=min_date, latest=max_date)