Skip to content

Commit

Permalink
fix: properly set args for date validation (#62)
Browse files: browse the repository at this point in the history
  • Loading branch information
adbar committed Aug 25, 2022
1 parent fce6e86 commit 1aceb8a
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 16 deletions.
21 changes: 15 additions & 6 deletions htmldate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,12 @@ def examine_header(
LOGGER.debug("examining meta itemprop: %s", logstring(elem))
if "content" in elem.attrib:
attempt = "-".join([elem.get("content"), "01", "01"])
if date_validator(attempt, "%Y-%m-%d", latest=max_date) is True:
if (
date_validator(
attempt, "%Y-%m-%d", earliest=min_date, latest=max_date
)
is True
):
reserve = attempt
# http-equiv, rare
elif "http-equiv" in elem.attrib:
Expand Down Expand Up @@ -636,7 +641,10 @@ def search_page(
)
if bestmatch is not None:
LOGGER.debug("Copyright detected: %s", bestmatch[0])
if date_validator(bestmatch[0], "%Y", latest=max_date) is True:
if (
date_validator(bestmatch[0], "%Y", earliest=min_date, latest=max_date)
is True
):
LOGGER.debug("copyright year/footer pattern found: %s", bestmatch[0])
copyear = int(bestmatch[0])

Expand Down Expand Up @@ -779,9 +787,9 @@ def search_page(
)
if bestmatch is not None:
pagedate = "-".join([bestmatch[1], bestmatch[2], "01"])
if date_validator(pagedate, "%Y-%m-%d", latest=max_date) is True and (
copyear == 0 or int(bestmatch[1]) >= copyear
):
if date_validator(
pagedate, "%Y-%m-%d", earliest=min_date, latest=max_date
) is True and (copyear == 0 or int(bestmatch[1]) >= copyear):
LOGGER.debug('date found for pattern "%s": %s', YYYYMM_PATTERN, pagedate)
return convert_date(pagedate, "%Y-%m-%d", outputformat)

Expand Down Expand Up @@ -836,7 +844,8 @@ def search_page(
if bestmatch is not None:
pagedate = "-".join([bestmatch[0], "01", "01"])
if (
date_validator(pagedate, "%Y-%m-%d", latest=max_date) is True
date_validator(pagedate, "%Y-%m-%d", earliest=min_date, latest=max_date)
is True
and int(bestmatch[0]) >= copyear
):
LOGGER.debug('date found for pattern "%s": %s', SIMPLE_PATTERN, pagedate)
Expand Down
5 changes: 1 addition & 4 deletions htmldate/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,10 @@
MIN_FILE_SIZE: int = 10

# Plausible dates
# earliest possible year to take into account (inclusive)
# earliest possible date to take into account (inclusive)
MIN_DATE: datetime = datetime(1995, 1, 1)
MIN_YEAR: int = MIN_DATE.year
# latest possible date
LATEST_POSSIBLE: datetime = datetime.now()
# latest possible year
MAX_YEAR: int = LATEST_POSSIBLE.year

# set an upper limit to the number of candidates
MAX_POSSIBLE_CANDIDATES: int = 1000
Expand Down
14 changes: 8 additions & 6 deletions htmldate/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
from time import mktime
from typing import Match, Optional, Pattern, Union, Counter as Counter_Type

from .settings import CACHE_SIZE, LATEST_POSSIBLE, MAX_YEAR, MIN_DATE, MIN_YEAR
from .settings import CACHE_SIZE, LATEST_POSSIBLE, MIN_DATE


LOGGER = logging.getLogger(__name__)
LOGGER.debug("date settings: %s %s %s", MIN_YEAR, LATEST_POSSIBLE, MAX_YEAR)
LOGGER.debug("date settings: %s %s", MIN_DATE, LATEST_POSSIBLE)


@lru_cache(maxsize=CACHE_SIZE)
Expand Down Expand Up @@ -50,7 +50,9 @@ def date_validator(
dateobject = date_input
# basic year validation
year = int(datetime.strftime(dateobject, "%Y"))
if MIN_YEAR <= year <= MAX_YEAR:
min_year, max_year = earliest.year, latest.year
# full validation
if min_year <= year <= max_year:
# not newer than today or stored variable
if earliest.timestamp() <= dateobject.timestamp() <= latest.timestamp():
return True
Expand Down Expand Up @@ -100,9 +102,9 @@ def plausible_year_filter(
potential_year = int("19" + lastdigits)
else:
potential_year = int("20" + lastdigits)
if potential_year < MIN_YEAR or potential_year > MAX_YEAR:
LOGGER.debug("no potential year: %s", item)
toremove.add(item)
# if potential_year < MIN_YEAR or potential_year > MAX_YEAR:
# LOGGER.debug("no potential year: %s", item)
# toremove.add(item)
# occurrences.remove(item)
# continue
else:
Expand Down
21 changes: 21 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1083,6 +1083,22 @@ def test_exact_date():
== "2021-07-13"
)

# min_date parameter
assert (
find_date(
'<html><meta><meta property="article:published_time" content="1991-01-02T01:01:00+01:00"></meta><body></body></html>',
min_date="2000-01-01",
)
is None
)
assert (
find_date(
'<html><meta><meta property="article:published_time" content="1991-01-02T01:01:00+01:00"></meta><body></body></html>',
min_date="1990-01-01",
)
== "1991-01-02"
)


def test_approximate_date():
"""this page should return an approximate date"""
Expand Down Expand Up @@ -1175,6 +1191,11 @@ def test_date_validator():
assert date_validator("202-01", OUTPUTFORMAT) is False
assert date_validator("1922", "%Y") is False
assert date_validator("2004", "%Y") is True
assert date_validator("1991-01-02", OUTPUTFORMAT, earliest=datetime.datetime(1990, 1, 1)) is True
assert date_validator("1991-01-02", OUTPUTFORMAT, earliest=datetime.datetime(1992, 1, 1)) is False
assert date_validator("1991-01-02", OUTPUTFORMAT, latest=datetime.datetime(1990, 1, 1)) is False
assert date_validator("1991-01-02", OUTPUTFORMAT, earliest=datetime.datetime(1990, 1, 1), latest=datetime.datetime(1995, 1, 1)) is True
assert date_validator("1991-01-02", OUTPUTFORMAT, earliest=datetime.datetime(1990, 1, 1), latest=datetime.datetime(1990, 12, 31)) is False


def test_convert_date():
Expand Down

0 comments on commit 1aceb8a

Please sign in to comment.