diff --git a/.github/dependabot.yml b/.github/dependabot.yml index c55650788c..b3c2234bc6 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -1,13 +1,21 @@ version: 2 updates: -- package-ecosystem: cargo - directory: "/" - schedule: - interval: daily - open-pull-requests-limit: 10 + - package-ecosystem: cargo + directory: "/" + schedule: + interval: "weekly" + # Group dependency updates into a single pull request. + groups: + dependencies: + patterns: + - "*" -- package-ecosystem: github-actions - directory: / - schedule: - interval: daily - open-pull-requests-limit: 10 + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + # Group dependency updates into a single pull request. + groups: + dependencies: + patterns: + - "*" diff --git a/Cargo.lock b/Cargo.lock index bc16f194da..58b990ca63 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -623,9 +623,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.3.21" +version = "4.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c27cdf28c0f604ba3f512b0c9a409f8de8513e4816705deb0498b627e7c3a3fd" +checksum = "03aef18ddf7d879c15ce20f04826ef8418101c7e528014c3eeea13321047dca3" dependencies = [ "clap_builder", "clap_derive", @@ -634,9 +634,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.3.21" +version = "4.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08a9f1ab5e9f01a9b81f202e8562eb9a10de70abf9eaeac1be465c28b75aa4aa" +checksum = "f8ce6fffb678c9b80a70b6b6de0aad31df727623a70fd9a842c30cd573e2fa98" dependencies = [ "anstream", "anstyle", @@ -825,7 +825,7 @@ dependencies = [ [[package]] name = "criterion" version = "0.5.1" -source = "git+https://github.com/bheisler/criterion.rs#c0461a6a7f42f86e8d6533ab082babe84e5c13b0" +source = "git+https://github.com/bheisler/criterion.rs#4c19e913b84e6a7e4a8470cb0f766796886ed891" dependencies = [ "anes", "cast", @@ -850,7 +850,7 @@ dependencies = [ 
[[package]] name = "criterion-plot" version = "0.5.0" -source = "git+https://github.com/bheisler/criterion.rs#c0461a6a7f42f86e8d6533ab082babe84e5c13b0" +source = "git+https://github.com/bheisler/criterion.rs#4c19e913b84e6a7e4a8470cb0f766796886ed891" dependencies = [ "cast", "itertools", @@ -1026,9 +1026,9 @@ dependencies = [ [[package]] name = "dashmap" -version = "5.5.0" +version = "5.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d" +checksum = "edd72493923899c6f10c641bdbdeddc7183d6396641d99c1a0d1597f37f92e28" dependencies = [ "cfg-if", "hashbrown 0.14.0", @@ -1924,7 +1924,7 @@ dependencies = [ "socket2 0.5.3", "widestring", "windows-sys 0.48.0", - "winreg 0.50.0", + "winreg", ] [[package]] @@ -3013,9 +3013,9 @@ checksum = "4bf2521270932c3c7bed1a59151222bd7643c79310f2916f01925e1e16255698" [[package]] name = "reqwest" -version = "0.11.18" +version = "0.11.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55" +checksum = "20b9b67e2ca7dd9e9f9285b759de30ff538aab981abaaf7bc9bd90b84a0126c3" dependencies = [ "async-compression", "base64 0.21.2", @@ -3056,7 +3056,7 @@ dependencies = [ "wasm-bindgen", "wasm-bindgen-futures", "web-sys", - "winreg 0.10.1", + "winreg", ] [[package]] @@ -3303,18 +3303,18 @@ checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918" [[package]] name = "serde" -version = "1.0.183" +version = "1.0.185" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c" +checksum = "be9b6f69f1dfd54c3b568ffa45c310d6973a5e5148fd40cf515acaf38cf5bc31" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.183" +version = "1.0.185" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" +checksum = "dc59dfdcbad1437773485e0367fea4b090a2e0a16d9ffc46af47764536a298ec" dependencies = [ "proc-macro2", "quote", @@ -3376,9 +3376,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.2.0" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1402f54f9a3b9e2efe71c1cea24e648acce55887983553eeb858cf3115acfd49" +checksum = "1ca3b16a3d82c4088f343b7480a93550b3eabe1a358569c2dfe38bbcead07237" dependencies = [ "base64 0.21.2", "chrono", @@ -3393,9 +3393,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.2.0" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9197f1ad0e3c173a0222d3c4404fb04c3afe87e962bcb327af73e8301fa203c7" +checksum = "2e6be15c453eb305019bfa438b1593c731f36a289a7853f7707ee29e870b3b3c" dependencies = [ "darling 0.20.3", "proc-macro2", @@ -3662,9 +3662,9 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.7.1" +version = "3.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc02fddf48964c42031a0b3fe0428320ecf3a73c401040fc0096f97794310651" +checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" dependencies = [ "cfg-if", "fastrand 2.0.0", @@ -4538,15 +4538,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "winreg" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" -dependencies = [ - "winapi", -] - [[package]] name = "winreg" version = "0.50.0" diff --git a/examples/builder/Cargo.toml b/examples/builder/Cargo.toml index 16f03ff1eb..049b72ffde 100644 --- a/examples/builder/Cargo.toml +++ b/examples/builder/Cargo.toml @@ -12,7 +12,7 @@ lychee-lib = { path = "../../lychee-lib", version = "0.13.0", default-features = tokio = { version = "1.32.0", features = ["full"] } regex = "1.9.3" 
http = "0.2.9" -reqwest = { version = "0.11.18", default-features = false, features = ["gzip"] } +reqwest = { version = "0.11.19", default-features = false, features = ["gzip"] } [features] email-check = ["lychee-lib/email-check"] diff --git a/examples/collect_links/Cargo.toml b/examples/collect_links/Cargo.toml index 6d82395e4c..8e6c83cb98 100644 --- a/examples/collect_links/Cargo.toml +++ b/examples/collect_links/Cargo.toml @@ -13,7 +13,7 @@ tokio = { version = "1.32.0", features = ["full"] } regex = "1.9.3" http = "0.2.9" tokio-stream = "0.1.14" -reqwest = { version = "0.11.18", default-features = false, features = ["gzip"] } +reqwest = { version = "0.11.19", default-features = false, features = ["gzip"] } [features] email-check = ["lychee-lib/email-check"] diff --git a/fixtures/fragments/file.html b/fixtures/fragments/file.html new file mode 100644 index 0000000000..db98b3af58 --- /dev/null +++ b/fixtures/fragments/file.html @@ -0,0 +1,22 @@ + + + + + For Testing Fragments + + +
+

+ To start + + let's run away. + +

+
+
+

Word

+ back we go + doesn't exist +
+ + diff --git a/fixtures/fragments/file1.md b/fixtures/fragments/file1.md index 623e61fcee..fb3642ab5e 100644 --- a/fixtures/fragments/file1.md +++ b/fixtures/fragments/file1.md @@ -21,11 +21,11 @@ This is a test file for the fragment loader. Explicit fragment links are currently not supported. Therefore we put the test into a code block for now to prevent false positives. -``` - + [Link to explicit fragment](#explicit-fragment) -``` + +[To the html doc](file.html#a-word) ## Custom Fragments diff --git a/lychee-bin/Cargo.toml b/lychee-bin/Cargo.toml index fc3d0f1160..0c8249c639 100644 --- a/lychee-bin/Cargo.toml +++ b/lychee-bin/Cargo.toml @@ -21,11 +21,11 @@ lychee-lib = { path = "../lychee-lib", version = "0.13.0", default-features = fa anyhow = "1.0.75" assert-json-diff = "2.0.2" -clap = { version = "4.3.21", features = ["env", "derive"] } +clap = { version = "4.3.23", features = ["env", "derive"] } console = "0.15.7" const_format = "0.2.31" csv = "1.2.2" -dashmap = { version = "5.5.0", features = ["serde"] } +dashmap = { version = "5.5.1", features = ["serde"] } env_logger = "0.10.0" futures = "0.3.28" headers = "0.3.8" @@ -38,7 +38,7 @@ once_cell = "1.18.0" openssl-sys = { version = "0.9.91", optional = true } pad = "0.1.6" regex = "1.9.3" -reqwest = { version = "0.11.18", default-features = false, features = ["gzip", "json"] } +reqwest = { version = "0.11.19", default-features = false, features = ["gzip", "json"] } reqwest_cookie_store = "0.6.0" # Make build work on Apple Silicon. 
# See https://github.com/briansmith/ring/issues/1163 @@ -46,7 +46,7 @@ reqwest_cookie_store = "0.6.0" # https://github.com/Homebrew/homebrew-core/pull/70216 ring = "0.16.20" secrecy = { version = "0.8.0", features = ["serde"] } -serde = { version = "1.0.183", features = ["derive"] } +serde = { version = "1.0.185", features = ["derive"] } serde_json = "1.0.105" strum = { version = "0.25.0", features = ["derive"] } supports-color = "2.0.0" @@ -59,7 +59,7 @@ toml = "0.7.6" assert_cmd = "2.0.12" predicates = "3.0.3" pretty_assertions = "1.4.0" -tempfile = "3.7.1" +tempfile = "3.8.0" tracing-subscriber = { version = "0.3.17", default-features = false, features = ["fmt", "registry", "env-filter"] } uuid = { version = "1.4.1", features = ["v4"] } wiremock = "0.5.19" diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index efdc4dea7e..81478879b1 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -14,6 +14,7 @@ mod cli { use lychee_lib::{InputSource, ResponseBody}; use predicates::str::{contains, is_empty}; use pretty_assertions::assert_eq; + use regex::Regex; use serde::Serialize; use serde_json::Value; use tempfile::NamedTempFile; @@ -1229,18 +1230,33 @@ mod cli { #[test] fn test_suggests_url_alternatives() -> Result<()> { - let mut cmd = main_command(); - let input = fixtures_path().join("INTERNET_ARCHIVE.md"); - - cmd.arg("--suggest") - .arg(input) - .assert() - .failure() - .code(2) - .stdout(contains("Suggestions")) - .stdout(contains("http://web.archive.org/web/")); + for _ in 0..3 { + // This can be flaky. 
Try up to 3 times + let mut cmd = main_command(); + let input = fixtures_path().join("INTERNET_ARCHIVE.md"); + + cmd.arg("--no-progress").arg("--suggest").arg(input); + + // Run the command and check if the output contains the expected + // suggestions + let assert = cmd.assert(); + let output = assert.get_output(); + + // We're looking for a suggestion that + // - starts with http://web.archive.org/web/ + // - ends with google.com/jobs.html + let re = Regex::new(r"http://web\.archive\.org/web/.*google\.com/jobs\.html").unwrap(); + if re.is_match(&String::from_utf8_lossy(&output.stdout)) { + // Test passed + return Ok(()); + } else { + // Wait for a second before retrying + std::thread::sleep(std::time::Duration::from_secs(1)); + } + } - Ok(()) + // If we reached here, it means the test did not pass after multiple attempts + Err("Did not get the expected command output after multiple attempts.".into()) } #[tokio::test] @@ -1411,6 +1427,7 @@ mod cli { .arg(input) .assert() .failure() + .stderr(contains("fixtures/fragments/file1.md#fragment-1")) .stderr(contains("fixtures/fragments/file1.md#fragment-2")) .stderr(contains("fixtures/fragments/file2.md#custom-id")) .stderr(contains("fixtures/fragments/file1.md#missing-fragment")) @@ -1418,12 +1435,15 @@ mod cli { .stderr(contains("fixtures/fragments/file1.md#kebab-case-fragment")) .stderr(contains("fixtures/fragments/file2.md#missing-fragment")) .stderr(contains("fixtures/fragments/empty_file#fragment")) + .stderr(contains("fixtures/fragments/file.html#a-word")) + .stderr(contains("fixtures/fragments/file.html#in-the-beginning")) + .stderr(contains("fixtures/fragments/file.html#in-the-end")) .stderr(contains( "fixtures/fragments/file1.md#kebab-case-fragment-1", )) - .stdout(contains("8 Total")) - .stdout(contains("6 OK")) - // 2 failures because of missing fragments - .stdout(contains("2 Errors")); + .stdout(contains("13 Total")) + .stdout(contains("10 OK")) + // 3 failures because of missing fragments + 
.stdout(contains("3 Errors")); } } diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 5a800c882d..4725033a11 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -35,7 +35,7 @@ pulldown-cmark = "0.9.3" regex = "1.9.3" # Use trust-dns to avoid lookup failures on high concurrency # https://github.com/seanmonstar/reqwest/issues/296 -reqwest = { version = "0.11.18", default-features = false, features = ["gzip", "trust-dns", "cookies"] } +reqwest = { version = "0.11.19", default-features = false, features = ["gzip", "trust-dns", "cookies"] } reqwest_cookie_store = "0.6.0" # Make build work on Apple Silicon. # See https://github.com/briansmith/ring/issues/1163 @@ -43,8 +43,8 @@ reqwest_cookie_store = "0.6.0" # https://github.com/Homebrew/homebrew-core/pull/70216 ring = "0.16.20" secrecy = "0.8.0" -serde = { version = "1.0.183", features = ["derive"] } -serde_with = "3.2.0" +serde = { version = "1.0.185", features = ["derive"] } +serde_with = "3.3.0" shellexpand = "3.1.0" thiserror = "1.0.47" tokio = { version = "1.32.0", features = ["full"] } @@ -57,7 +57,7 @@ features = ["runtime-tokio"] [dev-dependencies] doc-comment = "0.3.3" -tempfile = "3.7.1" +tempfile = "3.8.0" wiremock = "0.5.19" serde_json = "1.0.105" rstest = "0.18.1" diff --git a/lychee-lib/src/extract/html/html5gum.rs b/lychee-lib/src/extract/html/html5gum.rs index de55ab4665..9555beeb54 100644 --- a/lychee-lib/src/extract/html/html5gum.rs +++ b/lychee-lib/src/extract/html/html5gum.rs @@ -1,3 +1,5 @@ +use std::collections::HashSet; + use html5gum::{Emitter, Error, State, Tokenizer}; use super::{is_email_link, is_verbatim_elem, srcset}; @@ -7,6 +9,7 @@ use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri}; struct LinkExtractor { // note: what html5gum calls a tag, lychee calls an element links: Vec, + fragments: HashSet, current_string: Vec, current_element_name: Vec, current_element_is_closing: bool, @@ -26,9 +29,10 @@ unsafe fn from_utf8_unchecked(s: &[u8]) -> 
&str { } impl LinkExtractor { - pub(crate) const fn new(include_verbatim: bool) -> Self { + pub(crate) fn new(include_verbatim: bool) -> Self { LinkExtractor { links: Vec::new(), + fragments: HashSet::new(), current_string: Vec::new(), current_element_name: Vec::new(), current_element_is_closing: false, @@ -181,6 +185,10 @@ impl LinkExtractor { }; self.links.extend(new_urls); + + if attr == "id" { + self.fragments.insert(value.to_string()); + } } self.current_attribute_name.clear(); @@ -288,24 +296,44 @@ pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec { assert!(tokenizer.next().is_none()); extractor.links } + +/// Extract fragments from id attributes within a HTML string. +pub(crate) fn extract_html_fragments(buf: &str) -> HashSet { + let mut extractor = LinkExtractor::new(true); + let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible(); + assert!(tokenizer.next().is_none()); + extractor.fragments +} + #[cfg(test)] mod tests { use super::*; const HTML_INPUT: &str = r#" - -

This is a paragraph with some inline https://example.com and a normal example

+ +

This is a paragraph with some inline https://example.com and a normal example

         Some random text
         https://foo.com and http://bar.com/some/path
         Something else
         example link inside pre
         
-

bold

+

bold

"#; + #[test] + fn test_extract_fragments() { + let expected = HashSet::from([ + "content".to_string(), + "inline-code".to_string(), + "emphasis".to_string(), + ]); + let actual = extract_html_fragments(HTML_INPUT); + assert_eq!(actual, expected); + } + #[test] fn test_skip_verbatim() { let expected = vec![RawUri { diff --git a/lychee-lib/src/extract/html/mod.rs b/lychee-lib/src/extract/html/mod.rs index ceafb66f48..ef02c06fc4 100644 --- a/lychee-lib/src/extract/html/mod.rs +++ b/lychee-lib/src/extract/html/mod.rs @@ -1,3 +1,4 @@ +//! Extract links and fragments from html documents pub(crate) mod html5ever; pub(crate) mod html5gum; mod srcset; diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index 06e53d4654..4cb1fc3338 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -1,11 +1,11 @@ -//! Extract things from markdown documents +//! Extract links and fragments from markdown documents use std::collections::{HashMap, HashSet}; use pulldown_cmark::{Event, Options, Parser, Tag}; use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri}; -use super::html::html5gum::extract_html; +use super::html::html5gum::{extract_html, extract_html_fragments}; /// Extract unparsed URL strings from a Markdown string. 
pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec { @@ -80,7 +80,13 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec HashSet { let mut in_heading = false; let mut heading = String::new(); @@ -112,6 +118,11 @@ pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet { }; } + // An HTML node + Event::Html(html) => { + out.extend(extract_html_fragments(&html)); + } + // Silently skip over other events _ => (), } @@ -158,10 +169,12 @@ mod tests { use super::*; const MD_INPUT: &str = r#" -# Test +# A Test Some link in text [here](https://foo.com) +## A test {#well-still-the-same-test} + Code: ```bash @@ -171,8 +184,22 @@ https://bar.com/123 or inline like `https://bar.org` for instance. [example](http://example.com) + +The End "#; + #[test] + fn test_extract_fragments() { + let expected = HashSet::from([ + "a-test".to_string(), + "a-test-1".to_string(), + "well-still-the-same-test".to_string(), + "the-end".to_string(), + ]); + let actual = extract_markdown_fragments(MD_INPUT); + assert_eq!(actual, expected); + } + #[test] fn test_skip_verbatim() { let expected = vec![ diff --git a/lychee-lib/src/extract/mod.rs b/lychee-lib/src/extract/mod.rs index 3bf92c8588..d9675ecb3a 100644 --- a/lychee-lib/src/extract/mod.rs +++ b/lychee-lib/src/extract/mod.rs @@ -1,6 +1,6 @@ use crate::types::{uri::raw::RawUri, FileType, InputContent}; -mod html; +pub mod html; pub mod markdown; mod plaintext; diff --git a/lychee-lib/src/utils/fragment_checker.rs b/lychee-lib/src/utils/fragment_checker.rs index b7c0a250ac..39e034e139 100644 --- a/lychee-lib/src/utils/fragment_checker.rs +++ b/lychee-lib/src/utils/fragment_checker.rs @@ -4,7 +4,11 @@ use std::{ sync::Arc, }; -use crate::{extract::markdown::extract_markdown_fragments, types::FileType, Result}; +use crate::{ + extract::{html::html5gum::extract_html_fragments, markdown::extract_markdown_fragments}, + types::FileType, + Result, +}; use tokio::{fs, sync::Mutex}; use url::Url; 
@@ -39,37 +43,28 @@ impl FragmentChecker { /// /// In all other cases, returns true. pub(crate) async fn check(&self, path: &Path, url: &Url) -> Result { - match (FileType::from(path), url.fragment()) { - (FileType::Markdown, Some(fragment)) => { - let url_without_frag = Self::remove_fragment(url.clone()); - self.populate_cache_if_vacant(url_without_frag, path, fragment) - .await - } - _ => Ok(true), - } - } - - fn remove_fragment(mut url: Url) -> String { - url.set_fragment(None); - url.into() - } + let Some(fragment) = url.fragment() else { + return Ok(true) + }; + let url_without_frag = Self::remove_fragment(url.clone()); - /// Populates the fragment cache with the given URL if it - /// is not already in the cache. - async fn populate_cache_if_vacant( - &self, - url_without_frag: String, - path: &Path, - fragment: &str, - ) -> Result { - let mut fragment_cache = self.cache.lock().await; - match fragment_cache.entry(url_without_frag.clone()) { + let extractor = match FileType::from(path) { + FileType::Markdown => extract_markdown_fragments, + FileType::Html => extract_html_fragments, + FileType::Plaintext => return Ok(true), + }; + match self.cache.lock().await.entry(url_without_frag) { Entry::Vacant(entry) => { let content = fs::read_to_string(path).await?; - let file_frags = extract_markdown_fragments(&content); + let file_frags = extractor(&content); Ok(entry.insert(file_frags).contains(fragment)) } Entry::Occupied(entry) => Ok(entry.get().contains(fragment)), } } + + fn remove_fragment(mut url: Url) -> String { + url.set_fragment(None); + url.into() + } } diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index f092b72b1c..70f82ce1d8 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -131,7 +131,7 @@ fn construct_url(base: &Option, text: &str) -> Option> { } fn create_uri_from_path(src: &Path, dst: &str, base: &Option) -> Result> { - let (dst, frag) = 
url::remove_get_params_and_seperate_fragment(dst); + let (dst, frag) = url::remove_get_params_and_separate_fragment(dst); // Avoid double-encoding already encoded destination paths by removing any // potential encoding (e.g. `web%20site` becomes `web site`). // That's because Url::from_file_path will encode the full URL in the end. diff --git a/lychee-lib/src/utils/url.rs b/lychee-lib/src/utils/url.rs index 4eb40f76bb..c27f0a0202 100644 --- a/lychee-lib/src/utils/url.rs +++ b/lychee-lib/src/utils/url.rs @@ -4,9 +4,9 @@ use once_cell::sync::Lazy; static LINK_FINDER: Lazy = Lazy::new(LinkFinder::new); -/// Remove all GET parameters from a URL and seperates out the fragment. +/// Remove all GET parameters from a URL and separates out the fragment. /// The link is not a URL but a String as it may not have a base domain. -pub(crate) fn remove_get_params_and_seperate_fragment(url: &str) -> (&str, Option<&str>) { +pub(crate) fn remove_get_params_and_separate_fragment(url: &str) -> (&str, Option<&str>) { let (path, frag) = match url.split_once('#') { Some((path, fragment)) => (path, Some(fragment)), None => (url, None), @@ -29,48 +29,48 @@ mod test_fs_tree { #[test] fn test_remove_get_params_and_fragment() { - assert_eq!(remove_get_params_and_seperate_fragment("/"), ("/", None)); + assert_eq!(remove_get_params_and_separate_fragment("/"), ("/", None)); assert_eq!( - remove_get_params_and_seperate_fragment("index.html?foo=bar"), + remove_get_params_and_separate_fragment("index.html?foo=bar"), ("index.html", None) ); assert_eq!( - remove_get_params_and_seperate_fragment("/index.html?foo=bar"), + remove_get_params_and_separate_fragment("/index.html?foo=bar"), ("/index.html", None) ); assert_eq!( - remove_get_params_and_seperate_fragment("/index.html?foo=bar&baz=zorx?bla=blub"), + remove_get_params_and_separate_fragment("/index.html?foo=bar&baz=zorx?bla=blub"), ("/index.html", None) ); assert_eq!( - 
remove_get_params_and_seperate_fragment("https://example.com/index.html?foo=bar"), + remove_get_params_and_separate_fragment("https://example.com/index.html?foo=bar"), ("https://example.com/index.html", None) ); assert_eq!( - remove_get_params_and_seperate_fragment("test.png?foo=bar"), + remove_get_params_and_separate_fragment("test.png?foo=bar"), ("test.png", None) ); assert_eq!( - remove_get_params_and_seperate_fragment("https://example.com/index.html#anchor"), + remove_get_params_and_separate_fragment("https://example.com/index.html#anchor"), ("https://example.com/index.html", Some("anchor")) ); assert_eq!( - remove_get_params_and_seperate_fragment( + remove_get_params_and_separate_fragment( "https://example.com/index.html?foo=bar#anchor" ), ("https://example.com/index.html", Some("anchor")) ); assert_eq!( - remove_get_params_and_seperate_fragment("test.png?foo=bar#anchor"), + remove_get_params_and_separate_fragment("test.png?foo=bar#anchor"), ("test.png", Some("anchor")) ); assert_eq!( - remove_get_params_and_seperate_fragment("test.png#anchor?anchor!?"), + remove_get_params_and_separate_fragment("test.png#anchor?anchor!?"), ("test.png", Some("anchor?anchor!?")) ); assert_eq!( - remove_get_params_and_seperate_fragment("test.png?foo=bar#anchor?anchor!"), + remove_get_params_and_separate_fragment("test.png?foo=bar#anchor?anchor!"), ("test.png", Some("anchor?anchor!")) ); } diff --git a/lychee.example.toml b/lychee.example.toml index a26d8a3946..ce6ca3b3fb 100644 --- a/lychee.example.toml +++ b/lychee.example.toml @@ -94,7 +94,10 @@ include_verbatim = false glob_ignore_case = false # Exclude URLs and mail addresses from checking (supports regex). -exclude = ['.*github\.com.*'] +exclude = [ + '^https://www\.linkedin\.com', + '^https://web\.archive\.org/web/', +] # Exclude these filesystem paths from getting checked. exclude_path = ["file/path/to/Ignore", "./other/file/path/to/Ignore"]