Skip to content

Commit

Permalink
Fragments are now included in file links.
Browse files Browse the repository at this point in the history
These fragments are ignored during the file link check.
This commit allows their use in specific cases in the future.
  • Loading branch information
HU90m committed Jul 13, 2023
1 parent 255e9c6 commit ac29cb0
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 32 deletions.
23 changes: 18 additions & 5 deletions lychee-lib/src/utils/request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,19 @@ pub(crate) fn create(
credentials,
)))
} else if let InputSource::FsPath(root) = &input_content.source {
if is_anchor {
// Silently ignore anchor links for now
Ok(None)
} else if let Some(url) = create_uri_from_path(root, &text, base)? {
let path = if is_anchor {
// If the link is just an anchor,
// prepend the filename of the file from which the link came.
root.file_name()
.expect("File doesn't have a file name.")
.to_str()
.expect("Filename is invalid unicode")
.to_string()
+ &text
} else {
text
};
if let Some(url) = create_uri_from_path(root, &path, base)? {
let uri = Uri { url };
let credentials = credentials(extractor, &uri);

Expand Down Expand Up @@ -122,7 +131,7 @@ fn construct_url(base: &Option<Url>, text: &str) -> Option<Result<Url>> {
}

fn create_uri_from_path(src: &Path, dst: &str, base: &Option<Base>) -> Result<Option<Url>> {
let dst = url::remove_get_params_and_fragment(dst);
let (dst, frag) = url::remove_get_params_and_seperate_fragment(dst);
// Avoid double-encoding already encoded destination paths by removing any
// potential encoding (e.g. `web%20site` becomes `web site`).
// That's because Url::from_file_path will encode the full URL in the end.
Expand All @@ -136,6 +145,10 @@ fn create_uri_from_path(src: &Path, dst: &str, base: &Option<Base>) -> Result<Op
let resolved = path::resolve(src, &PathBuf::from(&*decoded), base)?;
match resolved {
Some(path) => Url::from_file_path(&path)
.map(|mut url| {
url.set_fragment(frag);
url
})
.map(Some)
.map_err(|_e| ErrorKind::InvalidUrlFromPath(path)),
None => Ok(None),
Expand Down
56 changes: 29 additions & 27 deletions lychee-lib/src/utils/url.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,18 @@ use once_cell::sync::Lazy;

static LINK_FINDER: Lazy<LinkFinder> = Lazy::new(LinkFinder::new);

/// Remove all GET parameters from a URL.
/// Remove all GET parameters from a URL and separate out the fragment.
/// The link is not a URL but a String as it may not have a base domain.
pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str {
let path = match url.split_once('#') {
Some((path_without_fragment, _fragment)) => path_without_fragment,
None => url,
pub(crate) fn remove_get_params_and_seperate_fragment(url: &str) -> (&str, Option<&str>) {
let (path, frag) = match url.split_once('#') {
Some((path, fragment)) => (path, Some(fragment)),
None => (url, None),
};
let path = match path.split_once('?') {
Some((path_without_params, _params)) => path_without_params,
None => path,
};
path
(path, frag)
}

// Use `LinkFinder` to offload the raw link searching in plaintext
Expand All @@ -29,47 +29,49 @@ mod test_fs_tree {

#[test]
fn test_remove_get_params_and_fragment() {
assert_eq!(remove_get_params_and_fragment("/"), "/");
assert_eq!(remove_get_params_and_seperate_fragment("/"), ("/", None));
assert_eq!(
remove_get_params_and_fragment("index.html?foo=bar"),
"index.html"
remove_get_params_and_seperate_fragment("index.html?foo=bar"),
("index.html", None)
);
assert_eq!(
remove_get_params_and_fragment("/index.html?foo=bar"),
"/index.html"
remove_get_params_and_seperate_fragment("/index.html?foo=bar"),
("/index.html", None)
);
assert_eq!(
remove_get_params_and_fragment("/index.html?foo=bar&baz=zorx?bla=blub"),
"/index.html"
remove_get_params_and_seperate_fragment("/index.html?foo=bar&baz=zorx?bla=blub"),
("/index.html", None)
);
assert_eq!(
remove_get_params_and_fragment("https://example.com/index.html?foo=bar"),
"https://example.com/index.html"
remove_get_params_and_seperate_fragment("https://example.com/index.html?foo=bar"),
("https://example.com/index.html", None)
);
assert_eq!(
remove_get_params_and_fragment("test.png?foo=bar"),
"test.png"
remove_get_params_and_seperate_fragment("test.png?foo=bar"),
("test.png", None)
);

assert_eq!(
remove_get_params_and_fragment("https://example.com/index.html#anchor"),
"https://example.com/index.html"
remove_get_params_and_seperate_fragment("https://example.com/index.html#anchor"),
("https://example.com/index.html", Some("anchor"))
);
assert_eq!(
remove_get_params_and_fragment("https://example.com/index.html?foo=bar#anchor"),
"https://example.com/index.html"
remove_get_params_and_seperate_fragment(
"https://example.com/index.html?foo=bar#anchor"
),
("https://example.com/index.html", Some("anchor"))
);
assert_eq!(
remove_get_params_and_fragment("test.png?foo=bar#anchor"),
"test.png"
remove_get_params_and_seperate_fragment("test.png?foo=bar#anchor"),
("test.png", Some("anchor"))
);
assert_eq!(
remove_get_params_and_fragment("test.png#anchor?anchor!?"),
"test.png"
remove_get_params_and_seperate_fragment("test.png#anchor?anchor!?"),
("test.png", Some("anchor?anchor!?"))
);
assert_eq!(
remove_get_params_and_fragment("test.png?foo=bar#anchor?anchor!"),
"test.png"
remove_get_params_and_seperate_fragment("test.png?foo=bar#anchor?anchor!"),
("test.png", Some("anchor?anchor!"))
);
}
}

0 comments on commit ac29cb0

Please sign in to comment.