deprecate base_url parameter in extract_links() and filter_links() (#121)

* deprecate base_url parameter in extract_links()

* add safeguards

* remove superfluous lines
adbar authored Oct 22, 2024
1 parent ea02ae9 commit fbe1e6c
Showing 2 changed files with 27 additions and 12 deletions.
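In practice the change is strict: passing base_url no longer emits a PendingDeprecationWarning but raises a ValueError, and the base is always derived from url. A minimal sketch of the new calling convention (hypothetical example URLs and HTML; assuming a courlan version containing this commit):

    from courlan.core import extract_links

    html = '<html><body><a href="/about">About</a></body></html>'

    # New style: pass the page URL, the base URL is derived internally.
    links = extract_links(html, url="https://example.org/index.html")
    print(links)  # relative links resolved against the base, e.g. https://example.org/about

    # Old style: passing base_url now fails fast instead of warning.
    try:
        extract_links(html, base_url="https://example.org")
    except ValueError as err:
        print(err)  # 'base_url' is deprecated, use 'url' instead.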
25 changes: 16 additions & 9 deletions courlan/core.py
@@ -5,7 +5,6 @@
 # import locale
 import logging
 import re
-import warnings
 
 from typing import List, Optional, Set, Tuple
 from urllib.robotparser import RobotFileParser
@@ -136,15 +135,16 @@ def check_url(
 def extract_links(
     pagecontent: str,
     url: Optional[str] = None,
-    base_url: Optional[str] = None,
     external_bool: bool = False,
+    *,
     no_filter: bool = False,
     language: Optional[str] = None,
     strict: bool = True,
     trailing_slash: bool = True,
     with_nav: bool = False,
     redirects: bool = False,
     reference: Optional[str] = None,
+    base_url: Optional[str] = None,
 ) -> Set[str]:
     """Filter links in a HTML document using a series of heuristics
     Args:
@@ -167,17 +167,17 @@
         Nothing.
     """
     if base_url:
-        warnings.warn(
-            "'base_url' will soon be deprecated, use 'url'.", PendingDeprecationWarning
-        )
+        raise ValueError("'base_url' is deprecated, use 'url' instead.")
 
-    base_url = base_url or get_base_url(url)
+    base_url = get_base_url(url)
     url = url or base_url
     candidates, validlinks = set(), set()  # type: Set[str], Set[str]
     if not pagecontent:
         return validlinks
+
     # define host reference
     reference = reference or base_url
+
     # extract links
     for link in (m[0] for m in FIND_LINKS_REGEX.finditer(pagecontent)):
         if "rel" in link and "nofollow" in link:
@@ -196,6 +196,7 @@
         linkmatch = LINK_REGEX.search(link)
         if linkmatch:
             candidates.add(linkmatch[1])
+
     # filter candidates
     for link in candidates:
         # repair using base
@@ -222,24 +223,29 @@
         if is_known_link(link, validlinks):
             continue
         validlinks.add(link)
-    # return
+
     LOGGER.info("%s links found – %s valid links", len(candidates), len(validlinks))
     return validlinks
 
 
 def filter_links(
     htmlstring: str,
     url: Optional[str],
-    base_url: Optional[str] = None,
+    *,
     lang: Optional[str] = None,
     rules: Optional[RobotFileParser] = None,
     external: bool = False,
     strict: bool = False,
     with_nav: bool = True,
+    base_url: Optional[str] = None,
 ) -> Tuple[List[str], List[str]]:
     "Find links in a HTML document, filter and prioritize them for crawling purposes."
+
+    if base_url:
+        raise ValueError("'base_url' is deprecated, use 'url' instead.")
+
     links, links_priority = [], []
-    url = url or base_url
+
     for link in extract_links(
         pagecontent=htmlstring,
         url=url,
@@ -258,4 +264,5 @@
             links_priority.append(link)
         else:
             links.append(link)
+
     return links, links_priority
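Migration for downstream callers of filter_links is mechanical: drop base_url and pass the same value as url. A short sketch (hypothetical URLs and HTML; assuming this commit is applied):

    from courlan.core import filter_links

    html = '<html><body><a href="/contact"/><a href="/page/2"/></body></html>'

    # Before: filter_links(html, url, base_url="https://example.org") -> now raises ValueError.
    # After: the page URL alone is enough, the base is derived internally.
    links, links_priority = filter_links(html, url="https://example.org/page/1")

    # With with_nav=True (the default), navigation-like links such as pagination
    # may land in links_priority rather than links.
    print(links, links_priority)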
14 changes: 11 additions & 3 deletions tests/unit_tests.py
@@ -862,7 +862,9 @@ def test_external():
 
 def test_extraction():
     """test link comparison in HTML"""
-    assert len(extract_links(None, "https://test.com/", False)) == 0
+    with pytest.raises(ValueError):
+        extract_links(None, base_url="https://test.com/", external_bool=False)
+    assert len(extract_links(None, url="https://test.com/", external_bool=False)) == 0
     assert len(extract_links("", "https://test.com/", False)) == 0
     # link known under another form
     pagecontent = '<html><a href="https://test.org/example"/><a href="https://test.org/example/&"/></html>'
@@ -933,7 +935,7 @@ def test_extraction():
         "https://httpbin.org/links/2/1",
     ]
     links = extract_links(
-        pagecontent, base_url="https://httpbin.org", external_bool=False, with_nav=True
+        pagecontent, url="https://httpbin.org", external_bool=False, with_nav=True
     )
     assert sorted(links) == [
         "https://httpbin.org/links/2/0",
@@ -1033,11 +1035,17 @@ def test_extraction():
         "https://test.com/example",
         "https://test.com/page/2",
     ]
+
     # link filtering
     base_url = "https://example.org"
     htmlstring = '<html><body><a href="https://example.org/page1"/><a href="https://example.org/page1/"/><a href="https://test.org/page1"/></body></html>'
-    links, links_priority = filter_links(htmlstring, base_url)
+
+    with pytest.raises(ValueError):
+        filter_links(htmlstring, url=None, base_url=base_url)
+
+    links, links_priority = filter_links(htmlstring, url=base_url)
     assert len(links) == 1 and not links_priority
+
     # link filtering with relative URLs
     url = "https://example.org/page1.html"
     htmlstring = '<html><body><a href="/subpage1"/><a href="/subpage1/"/><a href="https://test.org/page1"/></body></html>'
