From 1aceb8a77a3c19d0d2f666808fdf482767f73afc Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 25 Aug 2022 16:45:04 +0200 Subject: [PATCH] fix: properly set args for date validation (#62) --- htmldate/core.py | 21 +++++++++++++++------ htmldate/settings.py | 5 +---- htmldate/validators.py | 14 ++++++++------ tests/unit_tests.py | 21 +++++++++++++++++++++ 4 files changed, 45 insertions(+), 16 deletions(-) diff --git a/htmldate/core.py b/htmldate/core.py index 493a1490..cc45b407 100644 --- a/htmldate/core.py +++ b/htmldate/core.py @@ -299,7 +299,12 @@ def examine_header( LOGGER.debug("examining meta itemprop: %s", logstring(elem)) if "content" in elem.attrib: attempt = "-".join([elem.get("content"), "01", "01"]) - if date_validator(attempt, "%Y-%m-%d", latest=max_date) is True: + if ( + date_validator( + attempt, "%Y-%m-%d", earliest=min_date, latest=max_date + ) + is True + ): reserve = attempt # http-equiv, rare elif "http-equiv" in elem.attrib: @@ -636,7 +641,10 @@ def search_page( ) if bestmatch is not None: LOGGER.debug("Copyright detected: %s", bestmatch[0]) - if date_validator(bestmatch[0], "%Y", latest=max_date) is True: + if ( + date_validator(bestmatch[0], "%Y", earliest=min_date, latest=max_date) + is True + ): LOGGER.debug("copyright year/footer pattern found: %s", bestmatch[0]) copyear = int(bestmatch[0]) @@ -779,9 +787,9 @@ def search_page( ) if bestmatch is not None: pagedate = "-".join([bestmatch[1], bestmatch[2], "01"]) - if date_validator(pagedate, "%Y-%m-%d", latest=max_date) is True and ( - copyear == 0 or int(bestmatch[1]) >= copyear - ): + if date_validator( + pagedate, "%Y-%m-%d", earliest=min_date, latest=max_date + ) is True and (copyear == 0 or int(bestmatch[1]) >= copyear): LOGGER.debug('date found for pattern "%s": %s', YYYYMM_PATTERN, pagedate) return convert_date(pagedate, "%Y-%m-%d", outputformat) @@ -836,7 +844,8 @@ def search_page( if bestmatch is not None: pagedate = "-".join([bestmatch[0], "01", "01"]) if ( - date_validator(pagedate, "%Y-%m-%d", latest=max_date) is True + date_validator(pagedate, "%Y-%m-%d", earliest=min_date, latest=max_date) + is True and int(bestmatch[0]) >= copyear ): LOGGER.debug('date found for pattern "%s": %s', SIMPLE_PATTERN, pagedate) diff --git a/htmldate/settings.py b/htmldate/settings.py index 2d7bdf08..4f379262 100644 --- a/htmldate/settings.py +++ b/htmldate/settings.py @@ -19,13 +19,10 @@ MIN_FILE_SIZE: int = 10 # Plausible dates -# earliest possible year to take into account (inclusive) +# earliest possible date to take into account (inclusive) MIN_DATE: datetime = datetime(1995, 1, 1) -MIN_YEAR: int = MIN_DATE.year # latest possible date LATEST_POSSIBLE: datetime = datetime.now() -# latest possible year -MAX_YEAR: int = LATEST_POSSIBLE.year # set an upper limit to the number of candidates MAX_POSSIBLE_CANDIDATES: int = 1000 diff --git a/htmldate/validators.py b/htmldate/validators.py index 78dcd2d7..b812d2d8 100644 --- a/htmldate/validators.py +++ b/htmldate/validators.py @@ -15,11 +15,11 @@ from time import mktime from typing import Match, Optional, Pattern, Union, Counter as Counter_Type -from .settings import CACHE_SIZE, LATEST_POSSIBLE, MAX_YEAR, MIN_DATE, MIN_YEAR +from .settings import CACHE_SIZE, LATEST_POSSIBLE, MIN_DATE LOGGER = logging.getLogger(__name__) -LOGGER.debug("date settings: %s %s %s", MIN_YEAR, LATEST_POSSIBLE, MAX_YEAR) +LOGGER.debug("date settings: %s %s", MIN_DATE, LATEST_POSSIBLE) @lru_cache(maxsize=CACHE_SIZE) @@ -50,7 +50,9 @@ def date_validator( dateobject = date_input # basic year validation year = int(datetime.strftime(dateobject, "%Y")) - if MIN_YEAR <= year <= MAX_YEAR: + min_year, max_year = earliest.year, latest.year + # full validation + if min_year <= year <= max_year: # not newer than today or stored variable if earliest.timestamp() <= dateobject.timestamp() <= latest.timestamp(): return True @@ -100,9 +102,9 @@ def plausible_year_filter( potential_year = int("19" + lastdigits) else: potential_year = int("20" + lastdigits) - if potential_year < MIN_YEAR or potential_year > MAX_YEAR: - LOGGER.debug("no potential year: %s", item) - toremove.add(item) + # if potential_year < MIN_YEAR or potential_year > MAX_YEAR: + # LOGGER.debug("no potential year: %s", item) + # toremove.add(item) # occurrences.remove(item) # continue else: diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 7c28dfb4..e2fb66fe 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1083,6 +1083,22 @@ def test_exact_date(): == "2021-07-13" ) + # min_date parameter + assert ( + find_date( + '', + min_date="2000-01-01", + ) + is None + ) + assert ( + find_date( + '', + min_date="1990-01-01", + ) + == "1991-01-02" + ) + def test_approximate_date(): """this page should return an approximate date""" @@ -1175,6 +1191,11 @@ def test_date_validator(): assert date_validator("202-01", OUTPUTFORMAT) is False assert date_validator("1922", "%Y") is False assert date_validator("2004", "%Y") is True + assert date_validator("1991-01-02", OUTPUTFORMAT, earliest=datetime.datetime(1990, 1, 1)) is True + assert date_validator("1991-01-02", OUTPUTFORMAT, earliest=datetime.datetime(1992, 1, 1)) is False + assert date_validator("1991-01-02", OUTPUTFORMAT, latest=datetime.datetime(1990, 1, 1)) is False + assert date_validator("1991-01-02", OUTPUTFORMAT, earliest=datetime.datetime(1990, 1, 1), latest=datetime.datetime(1995, 1, 1)) is True + assert date_validator("1991-01-02", OUTPUTFORMAT, earliest=datetime.datetime(1990, 1, 1), latest=datetime.datetime(1990, 12, 31)) is False def test_convert_date():