diff --git a/README.md b/README.md
index 7b40d78..2d64c5d 100644
--- a/README.md
+++ b/README.md
@@ -137,9 +137,8 @@ available in `lang_filter(url, language)`:
 ```
 
 Define stricter restrictions on the expected content type with
-`strict=True`. Also blocks certain platforms and pages types crawlers
-should stay away from if they don't target them explicitly and other
-black holes where machines get lost.
+`strict=True`. This also blocks certain platforms and page types
+where machines get lost.
 
 ``` python
 # strict filtering: blocked as it is a major platform
@@ -158,6 +157,20 @@ black holes where machines get lost.
 
 ### Web crawling and URL handling
 
+Link extraction and preprocessing:
+
+``` python
+>>> from courlan import extract_links
+>>> doc = '<html><body><a href="test/link.html">Link</a></body></html>'
+>>> url = "https://example.org"
+>>> extract_links(doc, url)
+{'https://example.org/test/link.html'}
+# other options: external_bool, no_filter, language, strict, redirects, ...
+```
+
+The `filter_links()` function provides additional filters for crawling purposes:
+use of robots.txt rules and link prioritization. See `courlan.core` for details.
+
 Determine if a link leads to another host:
 
 ``` python
@@ -215,6 +228,10 @@ True
 True
 ```
 
+See also the [URL management page](https://trafilatura.readthedocs.io/en/latest/url-management.html)
+of the Trafilatura documentation.
+
+
 ### Python helpers
 
 Helper function, scrub and normalize:
diff --git a/courlan/core.py b/courlan/core.py
index a06e3eb..2e7b254 100644
--- a/courlan/core.py
+++ b/courlan/core.py
@@ -5,6 +5,7 @@ #
 import locale
 import logging
 import re
+import warnings
 
 from typing import List, Optional, Set, Tuple
 from urllib.robotparser import RobotFileParser
@@ -149,7 +150,6 @@ def extract_links(
 
     Args:
         pagecontent: whole page in binary format
         url: full URL of the original page
-        base_url: deprecated, legacy only
         external_bool: set to True for external links only, False for internal links only
         no_filter: override settings and bypass checks to return all possible URLs
@@ -157,7 +157,7 @@
         strict: set to True for stricter filtering
         trailing_slash: set to False to trim trailing slashes
         with_nav: set to True to include navigation pages instead of discarding them
-        with_redirects: set to True for redirection test (per HTTP HEAD request)
+        redirects: set to True for redirection test (per HTTP HEAD request)
         reference: provide a host reference for external/internal evaluation
 
     Returns:
@@ -166,6 +166,11 @@
     Raises:
         Nothing.
     """
+    if base_url:
+        warnings.warn(
+            "'base_url' will soon be deprecated, use 'url'.", PendingDeprecationWarning
+        )
+
     base_url = base_url or get_base_url(url)
     url = url or base_url
     candidates, validlinks = set(), set()  # type: Set[str], Set[str]
@@ -232,7 +237,7 @@ def filter_links(
     strict: bool = False,
     with_nav: bool = True,
 ) -> Tuple[List[str], List[str]]:
-    "Find links in a HTML document, filter them and add them to the data store."
+    "Find links in an HTML document, filter and prioritize them for crawling purposes."
     links, links_priority = [], []
    url = url or base_url
     for link in extract_links(
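
The deprecation path for `base_url` in `extract_links()` relies on `PendingDeprecationWarning`, which Python silences by default. Below is a minimal sketch of how a caller could surface the notice while migrating to the `url` parameter; it assumes `url` may be omitted when the legacy keyword is supplied, as the `url = url or base_url` fallback in the diff suggests:

``` python
import warnings

from courlan import extract_links

doc = '<html><body><a href="test/link.html">Link</a></body></html>'

# PendingDeprecationWarning is hidden by default, so record it explicitly
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    links = extract_links(doc, base_url="https://example.org")  # legacy keyword

print(links)
# {'https://example.org/test/link.html'}
print([str(w.message) for w in caught])
# ["'base_url' will soon be deprecated, use 'url'."]
```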
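The `filter_links()` function referenced in the README addition returns two lists (regular links and priority links) and, per that note, can take robots.txt rules into account. The following is a rough usage sketch under stated assumptions: the first two arguments mirror `extract_links()` (document string and URL) and the rules are passed as a `urllib.robotparser.RobotFileParser` instance through a `rules` keyword; neither of these is confirmed by this diff, so check `courlan.core` for the actual signature:

``` python
from urllib.robotparser import RobotFileParser

from courlan.core import filter_links

doc = '<html><body><a href="test/link.html">Link</a></body></html>'
url = "https://example.org"

# fetch and parse the robots.txt rules of the target host
rules = RobotFileParser()
rules.set_url("https://example.org/robots.txt")
rules.read()

# hypothetical call: the positional arguments and the 'rules' keyword are assumptions
links, links_priority = filter_links(doc, url, rules=rules, strict=False, with_nav=True)
```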