deprecate base_url parameter in extract_links() and filter_links() (#121)

* deprecate base_url parameter in extract_links()

* add safeguards

* remove superfluous lines
adbar authored Oct 22, 2024
1 parent ea02ae9 commit fbe1e6c
Showing 2 changed files with 27 additions and 12 deletions.
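In practice the change is strict: passing base_url no longer emits a PendingDeprecationWarning but raises a ValueError, and the base is always derived from url. A minimal sketch of the new calling convention (hypothetical example URLs and HTML; assuming a courlan version containing this commit):

    from courlan.core import extract_links

    html = '<html><body><a href="/about">About</a></body></html>'

    # New style: pass the page URL, the base URL is derived internally.
    links = extract_links(html, url="https://example.org/index.html")
    print(links)  # relative links resolved against the base, e.g. https://example.org/about

    # Old style: passing base_url now fails fast instead of warning.
    try:
        extract_links(html, base_url="https://example.org")
    except ValueError as err:
        print(err)  # 'base_url' is deprecated, use 'url' instead.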
25 changes: 16 additions & 9 deletions courlan/core.py
@@ -5,7 +5,6 @@
 # import locale
 import logging
 import re
-import warnings
 
 from typing import List, Optional, Set, Tuple
 from urllib.robotparser import RobotFileParser
@@ -136,15 +135,16 @@ def check_url(
 def extract_links(
     pagecontent: str,
     url: Optional[str] = None,
-    base_url: Optional[str] = None,
     external_bool: bool = False,
+    *,
     no_filter: bool = False,
     language: Optional[str] = None,
     strict: bool = True,
     trailing_slash: bool = True,
     with_nav: bool = False,
     redirects: bool = False,
     reference: Optional[str] = None,
+    base_url: Optional[str] = None,
 ) -> Set[str]:
     """Filter links in a HTML document using a series of heuristics
     Args:
@@ -167,17 +167,17 @@
         Nothing.
     """
     if base_url:
-        warnings.warn(
-            "'base_url' will soon be deprecated, use 'url'.", PendingDeprecationWarning
-        )
+        raise ValueError("'base_url' is deprecated, use 'url' instead.")
 
-    base_url = base_url or get_base_url(url)
+    base_url = get_base_url(url)
     url = url or base_url
     candidates, validlinks = set(), set()  # type: Set[str], Set[str]
     if not pagecontent:
         return validlinks
+
     # define host reference
     reference = reference or base_url
+
     # extract links
     for link in (m[0] for m in FIND_LINKS_REGEX.finditer(pagecontent)):
         if "rel" in link and "nofollow" in link:
@@ -196,6 +196,7 @@
         linkmatch = LINK_REGEX.search(link)
         if linkmatch:
             candidates.add(linkmatch[1])
+
     # filter candidates
     for link in candidates:
         # repair using base
@@ -222,24 +223,29 @@
         if is_known_link(link, validlinks):
             continue
         validlinks.add(link)
-    # return
+
     LOGGER.info("%s links found – %s valid links", len(candidates), len(validlinks))
     return validlinks
 
 
 def filter_links(
     htmlstring: str,
     url: Optional[str],
-    base_url: Optional[str] = None,
+    *,
     lang: Optional[str] = None,
     rules: Optional[RobotFileParser] = None,
     external: bool = False,
     strict: bool = False,
     with_nav: bool = True,
+    base_url: Optional[str] = None,
 ) -> Tuple[List[str], List[str]]:
     "Find links in a HTML document, filter and prioritize them for crawling purposes."
+
+    if base_url:
+        raise ValueError("'base_url' is deprecated, use 'url' instead.")
+
     links, links_priority = [], []
-    url = url or base_url
+
     for link in extract_links(
         pagecontent=htmlstring,
         url=url,
@@ -258,4 +264,5 @@
             links_priority.append(link)
         else:
             links.append(link)
+
     return links, links_priority
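Migration for downstream callers of filter_links is mechanical: drop base_url and pass the same value as url. A short sketch (hypothetical URLs and HTML; assuming this commit is applied):

    from courlan.core import filter_links

    html = '<html><body><a href="/contact"/><a href="/page/2"/></body></html>'

    # Before: filter_links(html, url, base_url="https://example.org") -> now raises ValueError.
    # After: the page URL alone is enough, the base is derived internally.
    links, links_priority = filter_links(html, url="https://example.org/page/1")

    # With with_nav=True (the default), navigation-like links such as pagination
    # may land in links_priority rather than links.
    print(links, links_priority)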
14 changes: 11 additions & 3 deletions tests/unit_tests.py
@@ -862,7 +862,9 @@ def test_external():
 
 def test_extraction():
     """test link comparison in HTML"""
-    assert len(extract_links(None, "https://test.com/", False)) == 0
+    with pytest.raises(ValueError):
+        extract_links(None, base_url="https://test.com/", external_bool=False)
+    assert len(extract_links(None, url="https://test.com/", external_bool=False)) == 0
     assert len(extract_links("", "https://test.com/", False)) == 0
     # link known under another form
     pagecontent = '<html><a href="https://test.org/example"/><a href="https://test.org/example/&"/></html>'
@@ -933,7 +935,7 @@ def test_extraction():
         "https://httpbin.org/links/2/1",
     ]
     links = extract_links(
-        pagecontent, base_url="https://httpbin.org", external_bool=False, with_nav=True
+        pagecontent, url="https://httpbin.org", external_bool=False, with_nav=True
     )
     assert sorted(links) == [
         "https://httpbin.org/links/2/0",
@@ -1033,11 +1035,17 @@ def test_extraction():
         "https://test.com/example",
         "https://test.com/page/2",
     ]
+
     # link filtering
     base_url = "https://example.org"
     htmlstring = '<html><body><a href="https://example.org/page1"/><a href="https://example.org/page1/"/><a href="https://test.org/page1"/></body></html>'
-    links, links_priority = filter_links(htmlstring, base_url)
+
+    with pytest.raises(ValueError):
+        filter_links(htmlstring, url=None, base_url=base_url)
+
+    links, links_priority = filter_links(htmlstring, url=base_url)
     assert len(links) == 1 and not links_priority
+
     # link filtering with relative URLs
     url = "https://example.org/page1.html"
     htmlstring = '<html><body><a href="/subpage1"/><a href="/subpage1/"/><a href="https://test.org/page1"/></body></html>'
