From 006ee6d3befff8d81749efb0d447f0b6d2d4e18d Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Thu, 17 Aug 2023 16:54:59 +0200 Subject: [PATCH 01/13] Make suggestion test more robust (#1229) --- lychee-bin/tests/cli.rs | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index efdc4dea7e..66a39bdd9e 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -14,6 +14,7 @@ mod cli { use lychee_lib::{InputSource, ResponseBody}; use predicates::str::{contains, is_empty}; use pretty_assertions::assert_eq; + use regex::Regex; use serde::Serialize; use serde_json::Value; use tempfile::NamedTempFile; @@ -1229,18 +1230,33 @@ mod cli { #[test] fn test_suggests_url_alternatives() -> Result<()> { - let mut cmd = main_command(); - let input = fixtures_path().join("INTERNET_ARCHIVE.md"); - - cmd.arg("--suggest") - .arg(input) - .assert() - .failure() - .code(2) - .stdout(contains("Suggestions")) - .stdout(contains("http://web.archive.org/web/")); + for _ in 0..3 { + // This can be flaky. 
Try up to 3 times + let mut cmd = main_command(); + let input = fixtures_path().join("INTERNET_ARCHIVE.md"); + + cmd.arg("--no-progress").arg("--suggest").arg(input); + + // Run the command and check if the output contains the expected + // suggestions + let assert = cmd.assert(); + let output = assert.get_output(); + + // We're looking for a suggestion that + // - starts with http://web.archive.org/web/ + // - ends with google.com/jobs.html + let re = Regex::new(r"http://web\.archive\.org/web/.*google\.com/jobs\.html").unwrap(); + if re.is_match(&String::from_utf8_lossy(&output.stdout)) { + // Test passed + return Ok(()); + } else { + // Wait for a second before retrying + std::thread::sleep(std::time::Duration::from_secs(1)); + } + } - Ok(()) + // If we reached here, it means the test did not pass after multiple attempts + Err("Did not get the expected command output after multiple attempts.".into()) } #[tokio::test] From ad9c9a367348770e1c2c5d3dfa69744eaeca52ce Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 18 Aug 2023 12:25:08 +0000 Subject: [PATCH 02/13] Bump clap from 4.3.21 to 4.3.22 Bumps [clap](https://github.com/clap-rs/clap) from 4.3.21 to 4.3.22. - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/v4.3.21...v4.3.22) --- updated-dependencies: - dependency-name: clap dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- lychee-bin/Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c9a423aac1..4ce6573555 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -623,9 +623,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.3.21" +version = "4.3.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c27cdf28c0f604ba3f512b0c9a409f8de8513e4816705deb0498b627e7c3a3fd" +checksum = "b417ae4361bca3f5de378294fc7472d3c4ed86a5ef9f49e93ae722f432aae8d2" dependencies = [ "clap_builder", "clap_derive", @@ -634,9 +634,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.3.21" +version = "4.3.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08a9f1ab5e9f01a9b81f202e8562eb9a10de70abf9eaeac1be465c28b75aa4aa" +checksum = "9c90dc0f0e42c64bff177ca9d7be6fcc9ddb0f26a6e062174a61c84dd6c644d4" dependencies = [ "anstream", "anstyle", diff --git a/lychee-bin/Cargo.toml b/lychee-bin/Cargo.toml index fc3d0f1160..539dd884c5 100644 --- a/lychee-bin/Cargo.toml +++ b/lychee-bin/Cargo.toml @@ -21,7 +21,7 @@ lychee-lib = { path = "../lychee-lib", version = "0.13.0", default-features = fa anyhow = "1.0.75" assert-json-diff = "2.0.2" -clap = { version = "4.3.21", features = ["env", "derive"] } +clap = { version = "4.3.22", features = ["env", "derive"] } console = "0.15.7" const_format = "0.2.31" csv = "1.2.2" From 07466c0bfd10669b5d97b57b7d5118417b3cf3d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20Sz=C3=A9pe?= Date: Sat, 19 Aug 2023 14:49:36 +0200 Subject: [PATCH 03/13] Fix typos (#1231) --- lychee-lib/src/utils/request.rs | 2 +- lychee-lib/src/utils/url.rs | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index f092b72b1c..70f82ce1d8 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs 
@@ -131,7 +131,7 @@ fn construct_url(base: &Option, text: &str) -> Option> { } fn create_uri_from_path(src: &Path, dst: &str, base: &Option) -> Result> { - let (dst, frag) = url::remove_get_params_and_seperate_fragment(dst); + let (dst, frag) = url::remove_get_params_and_separate_fragment(dst); // Avoid double-encoding already encoded destination paths by removing any // potential encoding (e.g. `web%20site` becomes `web site`). // That's because Url::from_file_path will encode the full URL in the end. diff --git a/lychee-lib/src/utils/url.rs b/lychee-lib/src/utils/url.rs index 4eb40f76bb..c27f0a0202 100644 --- a/lychee-lib/src/utils/url.rs +++ b/lychee-lib/src/utils/url.rs @@ -4,9 +4,9 @@ use once_cell::sync::Lazy; static LINK_FINDER: Lazy = Lazy::new(LinkFinder::new); -/// Remove all GET parameters from a URL and seperates out the fragment. +/// Remove all GET parameters from a URL and separates out the fragment. /// The link is not a URL but a String as it may not have a base domain. 
-pub(crate) fn remove_get_params_and_seperate_fragment(url: &str) -> (&str, Option<&str>) { +pub(crate) fn remove_get_params_and_separate_fragment(url: &str) -> (&str, Option<&str>) { let (path, frag) = match url.split_once('#') { Some((path, fragment)) => (path, Some(fragment)), None => (url, None), @@ -29,48 +29,48 @@ mod test_fs_tree { #[test] fn test_remove_get_params_and_fragment() { - assert_eq!(remove_get_params_and_seperate_fragment("/"), ("/", None)); + assert_eq!(remove_get_params_and_separate_fragment("/"), ("/", None)); assert_eq!( - remove_get_params_and_seperate_fragment("index.html?foo=bar"), + remove_get_params_and_separate_fragment("index.html?foo=bar"), ("index.html", None) ); assert_eq!( - remove_get_params_and_seperate_fragment("/index.html?foo=bar"), + remove_get_params_and_separate_fragment("/index.html?foo=bar"), ("/index.html", None) ); assert_eq!( - remove_get_params_and_seperate_fragment("/index.html?foo=bar&baz=zorx?bla=blub"), + remove_get_params_and_separate_fragment("/index.html?foo=bar&baz=zorx?bla=blub"), ("/index.html", None) ); assert_eq!( - remove_get_params_and_seperate_fragment("https://example.com/index.html?foo=bar"), + remove_get_params_and_separate_fragment("https://example.com/index.html?foo=bar"), ("https://example.com/index.html", None) ); assert_eq!( - remove_get_params_and_seperate_fragment("test.png?foo=bar"), + remove_get_params_and_separate_fragment("test.png?foo=bar"), ("test.png", None) ); assert_eq!( - remove_get_params_and_seperate_fragment("https://example.com/index.html#anchor"), + remove_get_params_and_separate_fragment("https://example.com/index.html#anchor"), ("https://example.com/index.html", Some("anchor")) ); assert_eq!( - remove_get_params_and_seperate_fragment( + remove_get_params_and_separate_fragment( "https://example.com/index.html?foo=bar#anchor" ), ("https://example.com/index.html", Some("anchor")) ); assert_eq!( - remove_get_params_and_seperate_fragment("test.png?foo=bar#anchor"), + 
remove_get_params_and_separate_fragment("test.png?foo=bar#anchor"), ("test.png", Some("anchor")) ); assert_eq!( - remove_get_params_and_seperate_fragment("test.png#anchor?anchor!?"), + remove_get_params_and_separate_fragment("test.png#anchor?anchor!?"), ("test.png", Some("anchor?anchor!?")) ); assert_eq!( - remove_get_params_and_seperate_fragment("test.png?foo=bar#anchor?anchor!"), + remove_get_params_and_separate_fragment("test.png?foo=bar#anchor?anchor!"), ("test.png", Some("anchor?anchor!")) ); } From 31a71162defba5771c34d318273eb9460ae4da6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20Sz=C3=A9pe?= Date: Mon, 21 Aug 2023 00:06:53 +0200 Subject: [PATCH 04/13] Add sensible excluded URL-s to lychee.example.toml (#1234) --- lychee.example.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lychee.example.toml b/lychee.example.toml index 9639eba31c..9c23c8ab84 100644 --- a/lychee.example.toml +++ b/lychee.example.toml @@ -88,7 +88,10 @@ include_verbatim = false glob_ignore_case = false # Exclude URLs and mail addresses from checking (supports regex). -exclude = [ '.*github\.com.*' ] +exclude = [ + '^https://www\.linkedin\.com', + '^https://web\.archive\.org/web/', +] # Exclude these filesystem paths from getting checked. exclude_path = ["file/path/to/Ignore", "./other/file/path/to/Ignore"] From c36a58cd76d58e1c4ce6e6e232eb412dfcbadc95 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Aug 2023 12:41:01 +0000 Subject: [PATCH 05/13] Bump serde from 1.0.183 to 1.0.185 Bumps [serde](https://github.com/serde-rs/serde) from 1.0.183 to 1.0.185. - [Release notes](https://github.com/serde-rs/serde/releases) - [Commits](https://github.com/serde-rs/serde/compare/v1.0.183...v1.0.185) --- updated-dependencies: - dependency-name: serde dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- lychee-bin/Cargo.toml | 2 +- lychee-lib/Cargo.toml | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4ce6573555..c38d6e7f54 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3251,18 +3251,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.183" +version = "1.0.185" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c" +checksum = "be9b6f69f1dfd54c3b568ffa45c310d6973a5e5148fd40cf515acaf38cf5bc31" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.183" +version = "1.0.185" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" +checksum = "dc59dfdcbad1437773485e0367fea4b090a2e0a16d9ffc46af47764536a298ec" dependencies = [ "proc-macro2", "quote", diff --git a/lychee-bin/Cargo.toml b/lychee-bin/Cargo.toml index 539dd884c5..683926e447 100644 --- a/lychee-bin/Cargo.toml +++ b/lychee-bin/Cargo.toml @@ -46,7 +46,7 @@ reqwest_cookie_store = "0.6.0" # https://github.com/Homebrew/homebrew-core/pull/70216 ring = "0.16.20" secrecy = { version = "0.8.0", features = ["serde"] } -serde = { version = "1.0.183", features = ["derive"] } +serde = { version = "1.0.185", features = ["derive"] } serde_json = "1.0.105" strum = { version = "0.25.0", features = ["derive"] } supports-color = "2.0.0" diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index e9a66130c4..f6e4bc6487 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -49,7 +49,7 @@ reqwest_cookie_store = "0.6.0" # https://github.com/Homebrew/homebrew-core/pull/70216 ring = "0.16.20" secrecy = "0.8.0" -serde = { version = "1.0.183", features = ["derive"] } +serde = { version = "1.0.185", features = ["derive"] } serde_with = "3.2.0" shellexpand = "3.1.0" thiserror = "1.0.47" From 
c965e51cb90c806d406d67f62f2e77f37cabbf51 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Aug 2023 18:42:36 +0200 Subject: [PATCH 06/13] Bump serde_with from 3.2.0 to 3.3.0 (#1237) Bumps [serde_with](https://github.com/jonasbb/serde_with) from 3.2.0 to 3.3.0. - [Release notes](https://github.com/jonasbb/serde_with/releases) - [Commits](https://github.com/jonasbb/serde_with/compare/v3.2.0...v3.3.0) --- updated-dependencies: - dependency-name: serde_with dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- lychee-lib/Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c38d6e7f54..15dbbd1382 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3324,9 +3324,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.2.0" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1402f54f9a3b9e2efe71c1cea24e648acce55887983553eeb858cf3115acfd49" +checksum = "1ca3b16a3d82c4088f343b7480a93550b3eabe1a358569c2dfe38bbcead07237" dependencies = [ "base64 0.21.2", "chrono", @@ -3341,9 +3341,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.2.0" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9197f1ad0e3c173a0222d3c4404fb04c3afe87e962bcb327af73e8301fa203c7" +checksum = "2e6be15c453eb305019bfa438b1593c731f36a289a7853f7707ee29e870b3b3c" dependencies = [ "darling 0.20.3", "proc-macro2", diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index f6e4bc6487..f0a124b607 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -50,7 +50,7 @@ reqwest_cookie_store = "0.6.0" ring = "0.16.20" secrecy = "0.8.0" serde = { version = "1.0.185", features = ["derive"] } -serde_with = 
"3.2.0" +serde_with = "3.3.0" shellexpand = "3.1.0" thiserror = "1.0.47" tokio = { version = "1.32.0", features = ["full"] } From 403dd6efba2fdc8e6af362f9442f4e0d07b29aae Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Aug 2023 18:42:51 +0200 Subject: [PATCH 07/13] Bump clap from 4.3.22 to 4.3.23 (#1236) Bumps [clap](https://github.com/clap-rs/clap) from 4.3.22 to 4.3.23. - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/v4.3.22...v4.3.23) --- updated-dependencies: - dependency-name: clap dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- lychee-bin/Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 15dbbd1382..925d6588b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -623,9 +623,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.3.22" +version = "4.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b417ae4361bca3f5de378294fc7472d3c4ed86a5ef9f49e93ae722f432aae8d2" +checksum = "03aef18ddf7d879c15ce20f04826ef8418101c7e528014c3eeea13321047dca3" dependencies = [ "clap_builder", "clap_derive", @@ -634,9 +634,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.3.22" +version = "4.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c90dc0f0e42c64bff177ca9d7be6fcc9ddb0f26a6e062174a61c84dd6c644d4" +checksum = "f8ce6fffb678c9b80a70b6b6de0aad31df727623a70fd9a842c30cd573e2fa98" dependencies = [ "anstream", "anstyle", diff --git a/lychee-bin/Cargo.toml b/lychee-bin/Cargo.toml index 683926e447..50cfe4c872 100644 --- a/lychee-bin/Cargo.toml +++ 
b/lychee-bin/Cargo.toml @@ -21,7 +21,7 @@ lychee-lib = { path = "../lychee-lib", version = "0.13.0", default-features = fa anyhow = "1.0.75" assert-json-diff = "2.0.2" -clap = { version = "4.3.22", features = ["env", "derive"] } +clap = { version = "4.3.23", features = ["env", "derive"] } console = "0.15.7" const_format = "0.2.31" csv = "1.2.2" From 9f6f5501fa0dc79aece4b50ea8984a78f84c9fe8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Aug 2023 18:43:12 +0200 Subject: [PATCH 08/13] Bump tempfile from 3.7.1 to 3.8.0 (#1238) Bumps [tempfile](https://github.com/Stebalien/tempfile) from 3.7.1 to 3.8.0. - [Changelog](https://github.com/Stebalien/tempfile/blob/master/CHANGELOG.md) - [Commits](https://github.com/Stebalien/tempfile/compare/v3.7.1...v3.8.0) --- updated-dependencies: - dependency-name: tempfile dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- lychee-bin/Cargo.toml | 2 +- lychee-lib/Cargo.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 925d6588b5..0daf7f26ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3610,9 +3610,9 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.7.1" +version = "3.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc02fddf48964c42031a0b3fe0428320ecf3a73c401040fc0096f97794310651" +checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" dependencies = [ "cfg-if", "fastrand 2.0.0", diff --git a/lychee-bin/Cargo.toml b/lychee-bin/Cargo.toml index 50cfe4c872..998c9505c1 100644 --- a/lychee-bin/Cargo.toml +++ b/lychee-bin/Cargo.toml @@ -59,7 +59,7 @@ toml = "0.7.6" assert_cmd = "2.0.12" predicates = "3.0.3" pretty_assertions = "1.4.0" -tempfile = "3.7.1" +tempfile = "3.8.0" 
tracing-subscriber = { version = "0.3.17", default-features = false, features = ["fmt", "registry", "env-filter"] } uuid = { version = "1.4.1", features = ["v4"] } wiremock = "0.5.19" diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index f0a124b607..09a816d66a 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -63,7 +63,7 @@ features = ["runtime-tokio"] [dev-dependencies] doc-comment = "0.3.3" -tempfile = "3.7.1" +tempfile = "3.8.0" wiremock = "0.5.19" serde_json = "1.0.105" From f59aa61ee3337f938e7871a6f077da09c0355f07 Mon Sep 17 00:00:00 2001 From: Hugo McNally <45573837+HU90m@users.noreply.github.com> Date: Tue, 22 Aug 2023 15:44:45 +0100 Subject: [PATCH 09/13] Check fragments in HTML files (#1198) * Added html5gum based fragment extractor * Markdown fragment extractor now extracts fragments from inline html * Added fragment checks for html file * Added inline html and html document to fragment checks test * Improved some comments * Improved documentation of markdown's fragment extractor. --- fixtures/fragments/file.html | 22 +++++++++++ fixtures/fragments/file1.md | 6 +-- lychee-bin/tests/cli.rs | 12 ++++-- lychee-lib/src/extract/html/html5gum.rs | 36 ++++++++++++++++-- lychee-lib/src/extract/html/mod.rs | 1 + lychee-lib/src/extract/markdown.rs | 35 ++++++++++++++++-- lychee-lib/src/extract/mod.rs | 2 +- lychee-lib/src/utils/fragment_checker.rs | 47 +++++++++++------------- 8 files changed, 119 insertions(+), 42 deletions(-) create mode 100644 fixtures/fragments/file.html diff --git a/fixtures/fragments/file.html b/fixtures/fragments/file.html new file mode 100644 index 0000000000..db98b3af58 --- /dev/null +++ b/fixtures/fragments/file.html @@ -0,0 +1,22 @@ + + + + + For Testing Fragments + + +
+

+ To start + + let's run away. + +

+
+
+

Word

+ back we go + doesn't exist +
+ + diff --git a/fixtures/fragments/file1.md b/fixtures/fragments/file1.md index 623e61fcee..fb3642ab5e 100644 --- a/fixtures/fragments/file1.md +++ b/fixtures/fragments/file1.md @@ -21,11 +21,11 @@ This is a test file for the fragment loader. Explicit fragment links are currently not supported. Therefore we put the test into a code block for now to prevent false positives. -``` - + [Link to explicit fragment](#explicit-fragment) -``` + +[To the html doc](file.html#a-word) ## Custom Fragments diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 66a39bdd9e..81478879b1 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -1427,6 +1427,7 @@ mod cli { .arg(input) .assert() .failure() + .stderr(contains("fixtures/fragments/file1.md#fragment-1")) .stderr(contains("fixtures/fragments/file1.md#fragment-2")) .stderr(contains("fixtures/fragments/file2.md#custom-id")) .stderr(contains("fixtures/fragments/file1.md#missing-fragment")) @@ -1434,12 +1435,15 @@ mod cli { .stderr(contains("fixtures/fragments/file1.md#kebab-case-fragment")) .stderr(contains("fixtures/fragments/file2.md#missing-fragment")) .stderr(contains("fixtures/fragments/empty_file#fragment")) + .stderr(contains("fixtures/fragments/file.html#a-word")) + .stderr(contains("fixtures/fragments/file.html#in-the-beginning")) + .stderr(contains("fixtures/fragments/file.html#in-the-end")) .stderr(contains( "fixtures/fragments/file1.md#kebab-case-fragment-1", )) - .stdout(contains("8 Total")) - .stdout(contains("6 OK")) - // 2 failures because of missing fragments - .stdout(contains("2 Errors")); + .stdout(contains("13 Total")) + .stdout(contains("10 OK")) + // 3 failures because of missing fragments + .stdout(contains("3 Errors")); } } diff --git a/lychee-lib/src/extract/html/html5gum.rs b/lychee-lib/src/extract/html/html5gum.rs index de55ab4665..9555beeb54 100644 --- a/lychee-lib/src/extract/html/html5gum.rs +++ b/lychee-lib/src/extract/html/html5gum.rs @@ -1,3 +1,5 @@ +use 
std::collections::HashSet; + use html5gum::{Emitter, Error, State, Tokenizer}; use super::{is_email_link, is_verbatim_elem, srcset}; @@ -7,6 +9,7 @@ use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri}; struct LinkExtractor { // note: what html5gum calls a tag, lychee calls an element links: Vec, + fragments: HashSet, current_string: Vec, current_element_name: Vec, current_element_is_closing: bool, @@ -26,9 +29,10 @@ unsafe fn from_utf8_unchecked(s: &[u8]) -> &str { } impl LinkExtractor { - pub(crate) const fn new(include_verbatim: bool) -> Self { + pub(crate) fn new(include_verbatim: bool) -> Self { LinkExtractor { links: Vec::new(), + fragments: HashSet::new(), current_string: Vec::new(), current_element_name: Vec::new(), current_element_is_closing: false, @@ -181,6 +185,10 @@ impl LinkExtractor { }; self.links.extend(new_urls); + + if attr == "id" { + self.fragments.insert(value.to_string()); + } } self.current_attribute_name.clear(); @@ -288,24 +296,44 @@ pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec { assert!(tokenizer.next().is_none()); extractor.links } + +/// Extract fragments from id attributes within a HTML string. +pub(crate) fn extract_html_fragments(buf: &str) -> HashSet { + let mut extractor = LinkExtractor::new(true); + let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible(); + assert!(tokenizer.next().is_none()); + extractor.fragments +} + #[cfg(test)] mod tests { use super::*; const HTML_INPUT: &str = r#" - -

This is a paragraph with some inline https://example.com and a normal example

+ +

This is a paragraph with some inline https://example.com and a normal example

         Some random text
         https://foo.com and http://bar.com/some/path
         Something else
         example link inside pre
         
-

bold

+

bold

"#; + #[test] + fn test_extract_fragments() { + let expected = HashSet::from([ + "content".to_string(), + "inline-code".to_string(), + "emphasis".to_string(), + ]); + let actual = extract_html_fragments(HTML_INPUT); + assert_eq!(actual, expected); + } + #[test] fn test_skip_verbatim() { let expected = vec![RawUri { diff --git a/lychee-lib/src/extract/html/mod.rs b/lychee-lib/src/extract/html/mod.rs index ceafb66f48..ef02c06fc4 100644 --- a/lychee-lib/src/extract/html/mod.rs +++ b/lychee-lib/src/extract/html/mod.rs @@ -1,3 +1,4 @@ +//! Extract links and fragments from html documents pub(crate) mod html5ever; pub(crate) mod html5gum; mod srcset; diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index 06e53d4654..4cb1fc3338 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -1,11 +1,11 @@ -//! Extract things from markdown documents +//! Extract links and fragments from markdown documents use std::collections::{HashMap, HashSet}; use pulldown_cmark::{Event, Options, Parser, Tag}; use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri}; -use super::html::html5gum::extract_html; +use super::html::html5gum::{extract_html, extract_html_fragments}; /// Extract unparsed URL strings from a Markdown string. 
pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec { @@ -80,7 +80,13 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec HashSet { let mut in_heading = false; let mut heading = String::new(); @@ -112,6 +118,11 @@ pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet { }; } + // An HTML node + Event::Html(html) => { + out.extend(extract_html_fragments(&html)); + } + // Silently skip over other events _ => (), } @@ -158,10 +169,12 @@ mod tests { use super::*; const MD_INPUT: &str = r#" -# Test +# A Test Some link in text [here](https://foo.com) +## A test {#well-still-the-same-test} + Code: ```bash @@ -171,8 +184,22 @@ https://bar.com/123 or inline like `https://bar.org` for instance. [example](http://example.com) + +The End "#; + #[test] + fn test_extract_fragments() { + let expected = HashSet::from([ + "a-test".to_string(), + "a-test-1".to_string(), + "well-still-the-same-test".to_string(), + "the-end".to_string(), + ]); + let actual = extract_markdown_fragments(MD_INPUT); + assert_eq!(actual, expected); + } + #[test] fn test_skip_verbatim() { let expected = vec![ diff --git a/lychee-lib/src/extract/mod.rs b/lychee-lib/src/extract/mod.rs index 3bf92c8588..d9675ecb3a 100644 --- a/lychee-lib/src/extract/mod.rs +++ b/lychee-lib/src/extract/mod.rs @@ -1,6 +1,6 @@ use crate::types::{uri::raw::RawUri, FileType, InputContent}; -mod html; +pub mod html; pub mod markdown; mod plaintext; diff --git a/lychee-lib/src/utils/fragment_checker.rs b/lychee-lib/src/utils/fragment_checker.rs index b7c0a250ac..39e034e139 100644 --- a/lychee-lib/src/utils/fragment_checker.rs +++ b/lychee-lib/src/utils/fragment_checker.rs @@ -4,7 +4,11 @@ use std::{ sync::Arc, }; -use crate::{extract::markdown::extract_markdown_fragments, types::FileType, Result}; +use crate::{ + extract::{html::html5gum::extract_html_fragments, markdown::extract_markdown_fragments}, + types::FileType, + Result, +}; use tokio::{fs, sync::Mutex}; use url::Url; 
@@ -39,37 +43,28 @@ impl FragmentChecker { /// /// In all other cases, returns true. pub(crate) async fn check(&self, path: &Path, url: &Url) -> Result { - match (FileType::from(path), url.fragment()) { - (FileType::Markdown, Some(fragment)) => { - let url_without_frag = Self::remove_fragment(url.clone()); - self.populate_cache_if_vacant(url_without_frag, path, fragment) - .await - } - _ => Ok(true), - } - } - - fn remove_fragment(mut url: Url) -> String { - url.set_fragment(None); - url.into() - } + let Some(fragment) = url.fragment() else { + return Ok(true) + }; + let url_without_frag = Self::remove_fragment(url.clone()); - /// Populates the fragment cache with the given URL if it - /// is not already in the cache. - async fn populate_cache_if_vacant( - &self, - url_without_frag: String, - path: &Path, - fragment: &str, - ) -> Result { - let mut fragment_cache = self.cache.lock().await; - match fragment_cache.entry(url_without_frag.clone()) { + let extractor = match FileType::from(path) { + FileType::Markdown => extract_markdown_fragments, + FileType::Html => extract_html_fragments, + FileType::Plaintext => return Ok(true), + }; + match self.cache.lock().await.entry(url_without_frag) { Entry::Vacant(entry) => { let content = fs::read_to_string(path).await?; - let file_frags = extract_markdown_fragments(&content); + let file_frags = extractor(&content); Ok(entry.insert(file_frags).contains(fragment)) } Entry::Occupied(entry) => Ok(entry.get().contains(fragment)), } } + + fn remove_fragment(mut url: Url) -> String { + url.set_fragment(None); + url.into() + } } From 6bbce8780aed05367a0bb94d2437a76c7276bd29 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 22 Aug 2023 16:45:05 +0200 Subject: [PATCH 10/13] Bump criterion from `c0461a6` to `4c19e91` (#1223) Bumps [criterion](https://github.com/bheisler/criterion.rs) from `c0461a6` to `4c19e91`. 
- [Commits](https://github.com/bheisler/criterion.rs/compare/c0461a6a7f42f86e8d6533ab082babe84e5c13b0...4c19e913b84e6a7e4a8470cb0f766796886ed891) --- updated-dependencies: - dependency-name: criterion dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0daf7f26ff..916fd8ab01 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -825,7 +825,7 @@ dependencies = [ [[package]] name = "criterion" version = "0.5.1" -source = "git+https://github.com/bheisler/criterion.rs#c0461a6a7f42f86e8d6533ab082babe84e5c13b0" +source = "git+https://github.com/bheisler/criterion.rs#4c19e913b84e6a7e4a8470cb0f766796886ed891" dependencies = [ "anes", "cast", @@ -850,7 +850,7 @@ dependencies = [ [[package]] name = "criterion-plot" version = "0.5.0" -source = "git+https://github.com/bheisler/criterion.rs#c0461a6a7f42f86e8d6533ab082babe84e5c13b0" +source = "git+https://github.com/bheisler/criterion.rs#4c19e913b84e6a7e4a8470cb0f766796886ed891" dependencies = [ "cast", "itertools", From f4383d2db5639b314f4c39cbb6de2dab595f877a Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Tue, 22 Aug 2023 16:50:37 +0200 Subject: [PATCH 11/13] Group Dependabot updates into a single pull request --- .github/dependabot.yml | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index c55650788c..b3c2234bc6 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -1,13 +1,21 @@ version: 2 updates: -- package-ecosystem: cargo - directory: "/" - schedule: - interval: daily - open-pull-requests-limit: 10 + - package-ecosystem: cargo + directory: "/" + schedule: + interval: "weekly" + # Group dependency updates into a single pull request. 
+ groups: + dependencies: + patterns: + - "*" -- package-ecosystem: github-actions - directory: / - schedule: - interval: daily - open-pull-requests-limit: 10 + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + # Group dependency updates into a single pull request. + groups: + dependencies: + patterns: + - "*" From 88f0e8f8bfc0f8a7b20285c267dbb3ca2dbc97a9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 22 Aug 2023 16:51:28 +0200 Subject: [PATCH 12/13] Bump dashmap from 5.5.0 to 5.5.1 (#1241) Bumps [dashmap](https://github.com/xacrimon/dashmap) from 5.5.0 to 5.5.1. - [Release notes](https://github.com/xacrimon/dashmap/releases) - [Commits](https://github.com/xacrimon/dashmap/compare/v5.5.0...v5.5.1) --- updated-dependencies: - dependency-name: dashmap dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- lychee-bin/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 916fd8ab01..dbeddd5910 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1026,9 +1026,9 @@ dependencies = [ [[package]] name = "dashmap" -version = "5.5.0" +version = "5.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d" +checksum = "edd72493923899c6f10c641bdbdeddc7183d6396641d99c1a0d1597f37f92e28" dependencies = [ "cfg-if", "hashbrown 0.14.0", diff --git a/lychee-bin/Cargo.toml b/lychee-bin/Cargo.toml index 998c9505c1..2968408581 100644 --- a/lychee-bin/Cargo.toml +++ b/lychee-bin/Cargo.toml @@ -25,7 +25,7 @@ clap = { version = "4.3.23", features = ["env", "derive"] } console = "0.15.7" const_format = "0.2.31" csv = "1.2.2" -dashmap = { version = "5.5.0", features = ["serde"] } +dashmap = { 
version = "5.5.1", features = ["serde"] } env_logger = "0.10.0" futures = "0.3.28" headers = "0.3.8" From 007650faf035e521848bc97e5f71556f2eb3891d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 22 Aug 2023 16:51:58 +0200 Subject: [PATCH 13/13] Bump reqwest from 0.11.18 to 0.11.19 (#1240) Bumps [reqwest](https://github.com/seanmonstar/reqwest) from 0.11.18 to 0.11.19. - [Release notes](https://github.com/seanmonstar/reqwest/releases) - [Changelog](https://github.com/seanmonstar/reqwest/blob/master/CHANGELOG.md) - [Commits](https://github.com/seanmonstar/reqwest/compare/v0.11.18...v0.11.19) --- updated-dependencies: - dependency-name: reqwest dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 17 ++++------------- examples/builder/Cargo.toml | 2 +- examples/collect_links/Cargo.toml | 2 +- lychee-bin/Cargo.toml | 2 +- lychee-lib/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dbeddd5910..b8d9993781 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1924,7 +1924,7 @@ dependencies = [ "socket2 0.5.3", "widestring", "windows-sys 0.48.0", - "winreg 0.50.0", + "winreg", ] [[package]] @@ -3005,9 +3005,9 @@ checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" [[package]] name = "reqwest" -version = "0.11.18" +version = "0.11.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55" +checksum = "20b9b67e2ca7dd9e9f9285b759de30ff538aab981abaaf7bc9bd90b84a0126c3" dependencies = [ "async-compression", "base64 0.21.2", @@ -3048,7 +3048,7 @@ dependencies = [ "wasm-bindgen", "wasm-bindgen-futures", "web-sys", - "winreg 0.10.1", + "winreg", ] [[package]] @@ -4486,15 +4486,6 @@ dependencies = [ 
"memchr", ] -[[package]] -name = "winreg" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" -dependencies = [ - "winapi", -] - [[package]] name = "winreg" version = "0.50.0" diff --git a/examples/builder/Cargo.toml b/examples/builder/Cargo.toml index 16f03ff1eb..049b72ffde 100644 --- a/examples/builder/Cargo.toml +++ b/examples/builder/Cargo.toml @@ -12,7 +12,7 @@ lychee-lib = { path = "../../lychee-lib", version = "0.13.0", default-features = tokio = { version = "1.32.0", features = ["full"] } regex = "1.9.3" http = "0.2.9" -reqwest = { version = "0.11.18", default-features = false, features = ["gzip"] } +reqwest = { version = "0.11.19", default-features = false, features = ["gzip"] } [features] email-check = ["lychee-lib/email-check"] diff --git a/examples/collect_links/Cargo.toml b/examples/collect_links/Cargo.toml index 6d82395e4c..8e6c83cb98 100644 --- a/examples/collect_links/Cargo.toml +++ b/examples/collect_links/Cargo.toml @@ -13,7 +13,7 @@ tokio = { version = "1.32.0", features = ["full"] } regex = "1.9.3" http = "0.2.9" tokio-stream = "0.1.14" -reqwest = { version = "0.11.18", default-features = false, features = ["gzip"] } +reqwest = { version = "0.11.19", default-features = false, features = ["gzip"] } [features] email-check = ["lychee-lib/email-check"] diff --git a/lychee-bin/Cargo.toml b/lychee-bin/Cargo.toml index 2968408581..0c8249c639 100644 --- a/lychee-bin/Cargo.toml +++ b/lychee-bin/Cargo.toml @@ -38,7 +38,7 @@ once_cell = "1.18.0" openssl-sys = { version = "0.9.91", optional = true } pad = "0.1.6" regex = "1.9.3" -reqwest = { version = "0.11.18", default-features = false, features = ["gzip", "json"] } +reqwest = { version = "0.11.19", default-features = false, features = ["gzip", "json"] } reqwest_cookie_store = "0.6.0" # Make build work on Apple Silicon. 
# See https://github.com/briansmith/ring/issues/1163 diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 09a816d66a..91adecda3e 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -41,7 +41,7 @@ pulldown-cmark = "0.9.3" regex = "1.9.3" # Use trust-dns to avoid lookup failures on high concurrency # https://github.com/seanmonstar/reqwest/issues/296 -reqwest = { version = "0.11.18", default-features = false, features = ["gzip", "trust-dns", "cookies"] } +reqwest = { version = "0.11.19", default-features = false, features = ["gzip", "trust-dns", "cookies"] } reqwest_cookie_store = "0.6.0" # Make build work on Apple Silicon. # See https://github.com/briansmith/ring/issues/1163