diff --git a/courlan/core.py b/courlan/core.py
index 2e7b254..4b11515 100644
--- a/courlan/core.py
+++ b/courlan/core.py
@@ -5,7 +5,6 @@
 # import locale
 import logging
 import re
-import warnings
 
 from typing import List, Optional, Set, Tuple
 from urllib.robotparser import RobotFileParser
@@ -136,8 +135,8 @@ def check_url(
 def extract_links(
     pagecontent: str,
     url: Optional[str] = None,
-    base_url: Optional[str] = None,
     external_bool: bool = False,
+    *,
     no_filter: bool = False,
     language: Optional[str] = None,
     strict: bool = True,
@@ -145,6 +144,7 @@ def extract_links(
     with_nav: bool = False,
     redirects: bool = False,
     reference: Optional[str] = None,
+    base_url: Optional[str] = None,
 ) -> Set[str]:
     """Filter links in a HTML document using a series of heuristics
     Args:
@@ -167,17 +167,17 @@
         Nothing.
     """
     if base_url:
-        warnings.warn(
-            "'base_url' will soon be deprecated, use 'url'.", PendingDeprecationWarning
-        )
+        raise ValueError("'base_url' is deprecated, use 'url' instead.")
 
-    base_url = base_url or get_base_url(url)
+    base_url = get_base_url(url)
     url = url or base_url
     candidates, validlinks = set(), set()  # type: Set[str], Set[str]
     if not pagecontent:
         return validlinks
+
     # define host reference
     reference = reference or base_url
+
     # extract links
     for link in (m[0] for m in FIND_LINKS_REGEX.finditer(pagecontent)):
         if "rel" in link and "nofollow" in link:
@@ -196,6 +196,7 @@
             linkmatch = LINK_REGEX.search(link)
             if linkmatch:
                 candidates.add(linkmatch[1])
+
     # filter candidates
     for link in candidates:
         # repair using base
@@ -222,7 +223,7 @@
         if is_known_link(link, validlinks):
             continue
         validlinks.add(link)
-    # return
+    LOGGER.info("%s links found – %s valid links", len(candidates), len(validlinks))
     return validlinks
 
 
@@ -230,16 +231,21 @@
 def filter_links(
     htmlstring: str,
     url: Optional[str],
-    base_url: Optional[str] = None,
+    *,
     lang: Optional[str] = None,
     rules: Optional[RobotFileParser] = None,
     external: bool = False,
     strict: bool = False,
     with_nav: bool = True,
+    base_url: Optional[str] = None,
 ) -> Tuple[List[str], List[str]]:
     "Find links in a HTML document, filter and prioritize them for crawling purposes."
+
+    if base_url:
+        raise ValueError("'base_url' is deprecated, use 'url' instead.")
+
     links, links_priority = [], []
-    url = url or base_url
+
     for link in extract_links(
         pagecontent=htmlstring,
         url=url,
@@ -258,4 +264,5 @@ def filter_links(
             links_priority.append(link)
         else:
             links.append(link)
+
     return links, links_priority
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 1df56ee..39d18ca 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -862,7 +862,9 @@ def test_external():
 
 def test_extraction():
     """test link comparison in HTML"""
-    assert len(extract_links(None, "https://test.com/", False)) == 0
+    with pytest.raises(ValueError):
+        extract_links(None, base_url="https://test.com/", external_bool=False)
+    assert len(extract_links(None, url="https://test.com/", external_bool=False)) == 0
     assert len(extract_links("", "https://test.com/", False)) == 0
     # link known under another form
     pagecontent = ''
@@ -933,7 +935,7 @@ def test_extraction():
         "https://httpbin.org/links/2/1",
     ]
     links = extract_links(
-        pagecontent, base_url="https://httpbin.org", external_bool=False, with_nav=True
+        pagecontent, url="https://httpbin.org", external_bool=False, with_nav=True
     )
     assert sorted(links) == [
         "https://httpbin.org/links/2/0",
@@ -1033,11 +1035,17 @@ def test_extraction():
         "https://test.com/example",
         "https://test.com/page/2",
     ]
+
     # link filtering
     base_url = "https://example.org"
     htmlstring = ''
-    links, links_priority = filter_links(htmlstring, base_url)
+
+    with pytest.raises(ValueError):
+        filter_links(htmlstring, url=None, base_url=base_url)
+
+    links, links_priority = filter_links(htmlstring, url=base_url)
     assert len(links) == 1 and not links_priority
+
     # link filtering with relative URLs
     url = "https://example.org/page1.html"
     htmlstring = ''
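For callers, a minimal migration sketch of the patched API (assuming the module above is importable as courlan.core; the sample HTML and the printed values are illustrative, not taken from the diff): options after the bare '*' are now keyword-only, and passing the removed 'base_url' argument raises a ValueError instead of emitting a pending-deprecation warning.

# Hypothetical usage example, not part of the diff: 'url' replaces 'base_url'.
from courlan.core import extract_links, filter_links

page = '<html><body><a href="/about">About</a></body></html>'

# relative links are resolved against the base of 'url'
links = extract_links(page, url="https://example.org", external_bool=False)
print(links)  # e.g. {'https://example.org/about'}, subject to courlan's filters

try:
    extract_links(page, base_url="https://example.org")
except ValueError as err:
    print(err)  # 'base_url' is deprecated, use 'url' instead.

# filter_links() follows the same convention: 'url' positional, options keyword-only
links, links_priority = filter_links(page, "https://example.org", with_nav=True)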