Skip to content

Commit

Permalink
Links to markdown files with fragments now have the fragments checked.
Browse files Browse the repository at this point in the history
Fragments/anchors are generated from markdown files using
*unique kebab case* and the *heading attributes* commonmark extension.
  • Loading branch information
HU90m committed Jul 14, 2023
1 parent b8d80a1 commit 48d4972
Show file tree
Hide file tree
Showing 11 changed files with 190 additions and 11 deletions.
16 changes: 16 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,9 @@ Options:
-a, --accept <ACCEPT>
Comma-separated list of accepted status codes for valid links
--include-fragments
Enable the checking of fragments in links
-t, --timeout <TIMEOUT>
Website timeout in seconds from connect to response finished
Expand Down
1 change: 1 addition & 0 deletions lychee-bin/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc<CookieStoreMutex>>) -
.accepted(accepted)
.require_https(cfg.require_https)
.cookie_jar(cookie_jar.cloned())
.include_fragments(cfg.include_fragments)
.build()
.client()
.context("Failed to create request client")
Expand Down
6 changes: 6 additions & 0 deletions lychee-bin/src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,11 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) accept: Option<HashSet<u16>>,

/// Enable the checking of fragments in links.
#[arg(long)]
#[serde(default)]
pub(crate) include_fragments: bool,

/// Website timeout in seconds from connect to response finished
#[arg(short, long, default_value = &TIMEOUT_STR)]
#[serde(default = "timeout")]
Expand Down Expand Up @@ -414,6 +419,7 @@ impl Config {
output: None;
require_https: false;
cookie_jar: None;
include_fragments: false;
}

if self
Expand Down
1 change: 1 addition & 0 deletions lychee-lib/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ version = "0.13.0"
async-stream = "0.3.5"
cached = "0.44.0"
check-if-email-exists = { version = "0.9.0", optional = true }
convert_case = "0.6.0"
email_address = "0.2.4"
futures = "0.3.27"
glob = "0.3.1"
Expand Down
45 changes: 37 additions & 8 deletions lychee-lib/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
clippy::default_trait_access,
clippy::used_underscore_binding
)]
use std::{collections::HashSet, sync::Arc, time::Duration};
use std::{collections::HashSet, path::Path, sync::Arc, time::Duration};

#[cfg(all(feature = "email-check", feature = "native-tls"))]
use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
Expand All @@ -22,7 +22,7 @@ use http::{
header::{HeaderMap, HeaderValue, AUTHORIZATION},
StatusCode,
};
use log::debug;
use log::{debug, warn};
use octocrab::Octocrab;
use regex::RegexSet;
use reqwest::{header, redirect, Url};
Expand All @@ -36,6 +36,7 @@ use crate::{
remap::Remaps,
retry::RetryExt,
types::uri::github::GithubUri,
utils::fragment_checker::FragmentChecker,
BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri,
};

Expand Down Expand Up @@ -270,6 +271,9 @@ pub struct ClientBuilder {
///
/// See https://docs.rs/reqwest/latest/reqwest/struct.ClientBuilder.html#method.cookie_store
cookie_jar: Option<Arc<CookieStoreMutex>>,

/// Enable the checking of fragments in links.
include_fragments: bool,
}

impl Default for ClientBuilder {
Expand Down Expand Up @@ -383,6 +387,8 @@ impl ClientBuilder {
accepted: self.accepted,
require_https: self.require_https,
quirks,
include_fragments: self.include_fragments,
fragment_checker: Default::default(),
})
}
}
Expand Down Expand Up @@ -429,6 +435,12 @@ pub struct Client {

/// Override behaviors for certain known issues with special URIs.
quirks: Quirks,

/// Enable the checking of fragments in links.
include_fragments: bool,

/// Caches Fragments
fragment_checker: FragmentChecker,
}

impl Client {
Expand Down Expand Up @@ -472,7 +484,7 @@ impl Client {
}

let status = match uri.scheme() {
_ if uri.is_file() => self.check_file(uri),
_ if uri.is_file() => self.check_file(uri).await,
_ if uri.is_mail() => self.check_mail(uri).await,
_ => self.check_website(uri, credentials).await?,
};
Expand Down Expand Up @@ -659,13 +671,30 @@ impl Client {
}

/// Check a `file` URI.
pub fn check_file(&self, uri: &Uri) -> Status {
if let Ok(path) = uri.url.to_file_path() {
if path.exists() {
return Status::Ok(StatusCode::OK);
/// Check a `file` URI.
///
/// Yields an `InvalidFilePath` error status when the URI cannot be turned
/// into a local path or when nothing exists at that path. For an existing
/// file the fragment is verified only if `include_fragments` is enabled;
/// otherwise the file's existence alone counts as success.
pub async fn check_file(&self, uri: &Uri) -> Status {
    match uri.url.to_file_path() {
        Ok(path) if path.exists() => {
            if self.include_fragments {
                self.check_fragment(&path, uri).await
            } else {
                Status::Ok(StatusCode::OK)
            }
        }
        // Either the conversion failed or the path does not exist.
        _ => ErrorKind::InvalidFilePath(uri.clone()).into(),
    }
}

/// Checks a `file` URI's fragment.
///
/// Delegates to the client's `FragmentChecker` and maps its answer to a
/// `Status`:
/// - `Ok(true)`  -> `Status::Ok(StatusCode::OK)`
/// - `Ok(false)` -> an `InvalidFragment` error status for `uri`
/// - `Err(_)`    -> a warning is logged and the link is still reported as OK
pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
match self.fragment_checker.check(path, uri).await {
Ok(true) => Status::Ok(StatusCode::OK),
Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
Err(err) => {
// Best effort: a failure to inspect the file must not fail the link itself.
warn!("Skipping fragment check due to the following error: {err}");
Status::Ok(StatusCode::OK)
}
}
// NOTE(review): the next line looks like a removed line of the previous
// `check_file` body carried over by the diff view, not part of this method — confirm.
ErrorKind::InvalidFilePath(uri.clone()).into()
}

/// Check a mail address, or equivalently a `mailto` URI.
Expand Down
63 changes: 61 additions & 2 deletions lychee-lib/src/extract/markdown.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
//! Extract things from markdown documents
use std::collections::{HashMap, HashSet};

use convert_case::{Case, Casing};
use pulldown_cmark::{Event, Parser, Tag};

use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};
Expand Down Expand Up @@ -77,6 +81,61 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
.collect()
}

/// Extract unparsed URL strings from a Markdown string.
pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet<String> {
let mut in_heading = false;
let mut heading = String::new();
let mut id_generator = HeadingIdGenerator::default();

let mut out = HashSet::new();

for event in Parser::new(input) {
match event {
Event::Start(Tag::Heading(..)) => {
in_heading = true;
}
Event::End(Tag::Heading(_d, id, ..)) => {
if let Some(frag) = id {
out.insert(frag.to_string());
}

let id = id_generator.generate(&mut heading);
out.insert(id);

in_heading = false;
heading.clear();
}
Event::Text(text) => {
if in_heading {
heading.push_str(&text);
};
}

// Silently skip over other events
_ => (),
}
}
out
}

/// Hands out unique kebab-case ids for heading texts.
///
/// Keeps a per-id usage count so that a heading text seen again receives a
/// numeric suffix instead of colliding with the earlier one.
#[derive(Default)]
struct HeadingIdGenerator {
    counter: HashMap<String, usize>,
}

impl HeadingIdGenerator {
    /// Return the id for `heading`.
    ///
    /// The first time a given kebab-cased text is requested it is returned
    /// unchanged; the n-th repeat (n >= 1) is returned as `<kebab>-<n>`.
    fn generate(&mut self, heading: &mut String) -> String {
        let base = heading.to_case(Case::Kebab);
        let seen = self.counter.entry(base.clone()).or_insert(0);
        let id = match *seen {
            0 => base,
            n => format!("{}-{}", base, n),
        };
        *seen += 1;

        id
    }
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -148,12 +207,12 @@ or inline like `https://bar.org` for instance.
#[test]
#[ignore]
fn test_skip_verbatim_html() {
let input = "
let input = "
<code>
http://link.com
</code>
<pre>
Some pre-formatted http://pre.com
Some pre-formatted http://pre.com
</pre>";

let expected = vec![];
Expand Down
2 changes: 1 addition & 1 deletion lychee-lib/src/extract/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::types::{uri::raw::RawUri, FileType, InputContent};

mod html5ever;
mod html5gum;
mod markdown;
pub mod markdown;
mod plaintext;

use linkify::{LinkFinder, LinkKind};
Expand Down
5 changes: 5 additions & 0 deletions lychee-lib/src/types/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ pub enum ErrorKind {
#[error("Cannot find file")]
InvalidFilePath(Uri),

/// The fragment of the given URI could not be found in the target file
#[error("Cannot find fragment")]
InvalidFragment(Uri),

/// The given path cannot be converted to a URI
#[error("Invalid path to URL conversion: {0}")]
InvalidUrlFromPath(PathBuf),
Expand Down Expand Up @@ -256,6 +260,7 @@ impl Hash for ErrorKind {
Self::InvalidUrlFromPath(p) => p.hash(state),
Self::Utf8(e) => e.to_string().hash(state),
Self::InvalidFilePath(u) => u.hash(state),
Self::InvalidFragment(u) => u.hash(state),
Self::UnreachableEmailAddress(u, ..) => u.hash(state),
Self::InsecureURL(u, ..) => u.hash(state),
Self::InvalidBase(base, e) => (base, e).hash(state),
Expand Down
58 changes: 58 additions & 0 deletions lychee-lib/src/utils/fragment_checker.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
use std::{
collections::{hash_map::Entry, HashMap, HashSet},
path::Path,
sync::Arc,
};

use crate::{extract::markdown::extract_markdown_fragments, types::FileType, Uri};
use tokio::{fs::File, io::AsyncReadExt, sync::Mutex};
use url::Url;

/// Verifies that the fragment of a `file` URI names an anchor that exists in
/// the referenced Markdown file.
///
/// The anchors extracted from a file are cached, keyed by the file's URL with
/// the fragment stripped, so later links into the same file do not read it
/// again. The cache sits behind an `Arc`, so clones of a checker share it.
#[derive(Default, Clone, Debug)]
pub(crate) struct FragmentChecker {
    cache: Arc<Mutex<HashMap<String, HashSet<String>>>>,
}

impl FragmentChecker {
    /// Check whether the fragment of `uri` exists in the file at `path`.
    ///
    /// Returns `Ok(true)` without touching the file when `path` is not a
    /// Markdown file or when `uri` carries no fragment. Otherwise returns
    /// whether the fragment is among the anchors extracted from the file.
    ///
    /// # Errors
    ///
    /// Returns the I/O error raised while opening or reading the file.
    pub(crate) async fn check(&self, path: &Path, uri: &Uri) -> Result<bool, std::io::Error> {
        let (FileType::Markdown, Some(fragment)) = (FileType::from(path), uri.url.fragment()) else {
            // If it is not a markdown file or if there is no fragment, return early.
            return Ok(true);
        };
        let url_without_frag = Self::remove_fragment(uri.url.clone());

        self.check_cache_if_vacant_populate(url_without_frag, path, fragment)
            .await
    }

    /// Serialize `url` with its fragment removed; used as the cache key.
    fn remove_fragment(mut url: Url) -> String {
        url.set_fragment(None);
        url.into()
    }

    /// Look up `fragment` in the cached anchors for `url_without_frag`,
    /// reading and parsing the file at `path` first if it is not cached yet.
    ///
    /// The cache lock is held while the file is read, so concurrent checks
    /// wait for each other; nothing is cached when reading fails.
    async fn check_cache_if_vacant_populate(
        &self,
        url_without_frag: String,
        path: &Path,
        fragment: &str,
    ) -> Result<bool, std::io::Error> {
        let mut fragment_cache = self.cache.lock().await;
        // The key is owned and not needed afterwards, so it is moved into the map.
        match fragment_cache.entry(url_without_frag) {
            Entry::Vacant(entry) => {
                let content = Self::read_file_content(path).await?;
                let file_frags = extract_markdown_fragments(&content);
                Ok(entry.insert(file_frags).contains(fragment))
            }
            Entry::Occupied(entry) => Ok(entry.get().contains(fragment)),
        }
    }

    /// Read the whole file at `path` into a `String`.
    async fn read_file_content(path: &Path) -> Result<String, std::io::Error> {
        let mut content = String::new();
        File::open(path).await?.read_to_string(&mut content).await?;
        Ok(content)
    }
}
1 change: 1 addition & 0 deletions lychee-lib/src/utils/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub(crate) mod fragment_checker;
pub(crate) mod path;
pub(crate) mod request;
pub(crate) mod reqwest;
Expand Down

0 comments on commit 48d4972

Please sign in to comment.