diff --git a/Cargo.lock b/Cargo.lock index c0b51bce74..39d7ddeee7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -730,6 +730,15 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "convert_case" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec182b0ca2f35d8fc196cf3404988fd8b8c739a4d270ff118a398feb0cbec1ca" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "cookie" version = "0.16.2" @@ -2106,6 +2115,7 @@ dependencies = [ "async-stream", "cached", "check-if-email-exists", + "convert_case", "doc-comment", "email_address", "futures", @@ -4078,6 +4088,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + [[package]] name = "unicode-width" version = "0.1.10" diff --git a/README.md b/README.md index 3e274162ad..2051069c85 100644 --- a/README.md +++ b/README.md @@ -362,6 +362,9 @@ Options: -a, --accept Comma-separated list of accepted status codes for valid links + --include-fragments + Enable the checking of fragments in links + -t, --timeout Website timeout in seconds from connect to response finished diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index 8d5e581fbd..3555fb0832 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -58,6 +58,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - .accepted(accepted) .require_https(cfg.require_https) .cookie_jar(cookie_jar.cloned()) + .include_fragments(cfg.include_fragments) .build() .client() .context("Failed to create request client") diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 7b212d55c9..2bf7f1ce81 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -295,6 +295,11 @@ pub(crate) struct Config { #[serde(default)] pub(crate) accept: Option>, + /// Enable the 
checking of fragments in links. + #[arg(long)] + #[serde(default)] + pub(crate) include_fragments: bool, + /// Website timeout in seconds from connect to response finished #[arg(short, long, default_value = &TIMEOUT_STR)] #[serde(default = "timeout")] @@ -414,6 +419,7 @@ impl Config { output: None; require_https: false; cookie_jar: None; + include_fragments: false; } if self diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 369d100903..deeca8ba84 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -20,6 +20,7 @@ version = "0.13.0" async-stream = "0.3.5" cached = "0.44.0" check-if-email-exists = { version = "0.9.0", optional = true } +convert_case = "0.6.0" email_address = "0.2.4" futures = "0.3.27" glob = "0.3.1" diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 35054b4d68..d9d62ced22 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -13,7 +13,7 @@ clippy::default_trait_access, clippy::used_underscore_binding )] -use std::{collections::HashSet, sync::Arc, time::Duration}; +use std::{collections::HashSet, path::Path, sync::Arc, time::Duration}; #[cfg(all(feature = "email-check", feature = "native-tls"))] use check_if_email_exists::{check_email, CheckEmailInput, Reachable}; @@ -22,7 +22,7 @@ use http::{ header::{HeaderMap, HeaderValue, AUTHORIZATION}, StatusCode, }; -use log::debug; +use log::{debug, warn}; use octocrab::Octocrab; use regex::RegexSet; use reqwest::{header, redirect, Url}; @@ -36,6 +36,7 @@ use crate::{ remap::Remaps, retry::RetryExt, types::uri::github::GithubUri, + utils::fragment_checker::FragmentChecker, BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri, }; @@ -270,6 +271,9 @@ pub struct ClientBuilder { /// /// See https://docs.rs/reqwest/latest/reqwest/struct.ClientBuilder.html#method.cookie_store cookie_jar: Option>, + + /// Enable the checking of fragments in links. 
+ include_fragments: bool, } impl Default for ClientBuilder { @@ -383,6 +387,8 @@ impl ClientBuilder { accepted: self.accepted, require_https: self.require_https, quirks, + include_fragments: self.include_fragments, + fragment_checker: Default::default(), }) } } @@ -429,6 +435,12 @@ pub struct Client { /// Override behaviors for certain known issues with special URIs. quirks: Quirks, + + /// Enable the checking of fragments in links. + include_fragments: bool, + + /// Caches Fragments + fragment_checker: FragmentChecker, } impl Client { @@ -472,7 +484,7 @@ impl Client { } let status = match uri.scheme() { - _ if uri.is_file() => self.check_file(uri), + _ if uri.is_file() => self.check_file(uri).await, _ if uri.is_mail() => self.check_mail(uri).await, _ => self.check_website(uri, credentials).await?, }; @@ -659,13 +671,30 @@ impl Client { } /// Check a `file` URI. - pub fn check_file(&self, uri: &Uri) -> Status { - if let Ok(path) = uri.url.to_file_path() { - if path.exists() { - return Status::Ok(StatusCode::OK); + pub async fn check_file(&self, uri: &Uri) -> Status { + let Ok(path) = uri.url.to_file_path() else { + return ErrorKind::InvalidFilePath(uri.clone()).into(); + }; + if !path.exists() { + return ErrorKind::InvalidFilePath(uri.clone()).into(); + } + if self.include_fragments { + self.check_fragment(&path, uri).await + } else { + Status::Ok(StatusCode::OK) + } + } + + /// Checks a `file` URI's fragment. + pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status { + match self.fragment_checker.check(path, uri).await { + Ok(true) => Status::Ok(StatusCode::OK), + Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(), + Err(err) => { + warn!("Skipping fragment check due to the following error: {err}"); + Status::Ok(StatusCode::OK) } } - ErrorKind::InvalidFilePath(uri.clone()).into() } /// Check a mail address, or equivalently a `mailto` URI. 
diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index 1867ec1ab8..82bd243b9f 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -1,3 +1,7 @@ +//! Extract things from markdown documents +use std::collections::{HashMap, HashSet}; + +use convert_case::{Case, Casing}; use pulldown_cmark::{Event, Parser, Tag}; use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri}; @@ -77,6 +81,61 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec HashSet { + let mut in_heading = false; + let mut heading = String::new(); + let mut id_generator = HeadingIdGenerator::default(); + + let mut out = HashSet::new(); + + for event in Parser::new(input) { + match event { + Event::Start(Tag::Heading(..)) => { + in_heading = true; + } + Event::End(Tag::Heading(_d, id, ..)) => { + if let Some(frag) = id { + out.insert(frag.to_string()); + } + + let id = id_generator.generate(&mut heading); + out.insert(id); + + in_heading = false; + heading.clear(); + } + Event::Text(text) => { + if in_heading { + heading.push_str(&text); + }; + } + + // Silently skip over other events + _ => (), + } + } + out +} + +#[derive(Default)] +struct HeadingIdGenerator { + counter: HashMap, +} + +impl HeadingIdGenerator { + fn generate(&mut self, heading: &mut String) -> String { + let mut id = heading.to_case(Case::Kebab); + let count = self.counter.entry(id.clone()).or_insert(0); + if *count != 0 { + id = format!("{}-{}", id, *count); + } + *count += 1; + + id + } +} + #[cfg(test)] mod tests { use super::*; @@ -148,12 +207,12 @@ or inline like `https://bar.org` for instance. #[test] #[ignore] fn test_skip_verbatim_html() { - let input = " + let input = " http://link.com
-Some pre-formatted http://pre.com 
+Some pre-formatted http://pre.com
 
"; let expected = vec![]; diff --git a/lychee-lib/src/extract/mod.rs b/lychee-lib/src/extract/mod.rs index 3c6b1328ce..ed2047fe49 100644 --- a/lychee-lib/src/extract/mod.rs +++ b/lychee-lib/src/extract/mod.rs @@ -2,7 +2,7 @@ use crate::types::{uri::raw::RawUri, FileType, InputContent}; mod html5ever; mod html5gum; -mod markdown; +pub mod markdown; mod plaintext; use linkify::{LinkFinder, LinkKind}; diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 161fac24fd..0844b45d55 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -63,6 +63,10 @@ pub enum ErrorKind { #[error("Cannot find file")] InvalidFilePath(Uri), + /// The fragment of the given URI cannot be found in the target file + #[error("Cannot find fragment")] + InvalidFragment(Uri), + /// The given path cannot be converted to a URI #[error("Invalid path to URL conversion: {0}")] InvalidUrlFromPath(PathBuf), @@ -256,6 +260,7 @@ impl Hash for ErrorKind { Self::InvalidUrlFromPath(p) => p.hash(state), Self::Utf8(e) => e.to_string().hash(state), Self::InvalidFilePath(u) => u.hash(state), + Self::InvalidFragment(u) => u.hash(state), Self::UnreachableEmailAddress(u, ..) => u.hash(state), Self::InsecureURL(u, ..) 
=> u.hash(state), Self::InvalidBase(base, e) => (base, e).hash(state), diff --git a/lychee-lib/src/utils/fragment_checker.rs b/lychee-lib/src/utils/fragment_checker.rs new file mode 100644 index 0000000000..0032d52e84 --- /dev/null +++ b/lychee-lib/src/utils/fragment_checker.rs @@ -0,0 +1,58 @@ +use std::{ + collections::{hash_map::Entry, HashMap, HashSet}, + path::Path, + sync::Arc, +}; + +use crate::{extract::markdown::extract_markdown_fragments, types::FileType, Uri}; +use tokio::{fs::File, io::AsyncReadExt, sync::Mutex}; +use url::Url; + +#[derive(Default, Clone, Debug)] +pub(crate) struct FragmentChecker { + cache: Arc>>>, +} + +impl FragmentChecker { + pub(crate) async fn check(&self, path: &Path, uri: &Uri) -> Result { + let (FileType::Markdown, Some(fragment)) = (FileType::from(path), uri.url.fragment()) else { + // If it is not a markdown file or if there is no fragment, return early. + return Ok(true) + }; + let url_without_frag = Self::remove_fragment(uri.url.clone()); + + let frag_exists = self + .check_cache_if_vacant_populate(url_without_frag, path, fragment) + .await?; + Ok(frag_exists) + } + + fn remove_fragment(url: Url) -> String { + let mut url = url; + url.set_fragment(None); + url.into() + } + + async fn check_cache_if_vacant_populate( + &self, + url_without_frag: String, + path: &Path, + fragment: &str, + ) -> Result { + let mut fragment_cache = self.cache.lock().await; + match fragment_cache.entry(url_without_frag.clone()) { + Entry::Vacant(entry) => { + let content = Self::read_file_content(path).await?; + let file_frags = extract_markdown_fragments(&content); + Ok(entry.insert(file_frags).contains(fragment)) + } + Entry::Occupied(entry) => Ok(entry.get().contains(fragment)), + } + } + + async fn read_file_content(path: &Path) -> Result { + let mut content = String::new(); + File::open(path).await?.read_to_string(&mut content).await?; + Ok(content) + } +} diff --git a/lychee-lib/src/utils/mod.rs b/lychee-lib/src/utils/mod.rs index 
fe6aec3356..d75d20c064 100644 --- a/lychee-lib/src/utils/mod.rs +++ b/lychee-lib/src/utils/mod.rs @@ -1,3 +1,4 @@ +pub(crate) mod fragment_checker; pub(crate) mod path; pub(crate) mod request; pub(crate) mod reqwest;