diff --git a/Cargo.lock b/Cargo.lock index f76436c8de..cc1ea9b6e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -689,6 +689,7 @@ dependencies = [ name = "collect_links" version = "0.1.0" dependencies = [ + "ada-url", "http", "lychee-lib", "regex", diff --git a/examples/collect_links/Cargo.toml b/examples/collect_links/Cargo.toml index 8e6c83cb98..a9d6e21359 100644 --- a/examples/collect_links/Cargo.toml +++ b/examples/collect_links/Cargo.toml @@ -9,6 +9,7 @@ path = "collect_links.rs" [dependencies] lychee-lib = { path = "../../lychee-lib", version = "0.13.0", default-features = false } +ada-url = { version = "1.4.3" } tokio = { version = "1.32.0", features = ["full"] } regex = "1.9.3" http = "0.2.9" diff --git a/examples/collect_links/collect_links.rs b/examples/collect_links/collect_links.rs index d6c466271d..acf142f170 100644 --- a/examples/collect_links/collect_links.rs +++ b/examples/collect_links/collect_links.rs @@ -1,5 +1,5 @@ +use ada_url::Url; use lychee_lib::{Collector, Input, InputSource, Result}; -use reqwest::Url; use std::path::PathBuf; use tokio_stream::StreamExt; @@ -10,7 +10,7 @@ async fn main() -> Result<()> { let inputs = vec![ Input { source: InputSource::RemoteUrl(Box::new( - Url::parse("https://github.com/lycheeverse/lychee").unwrap(), + Url::parse("https://github.com/lycheeverse/lychee", None).unwrap(), )), file_type_hint: None, excluded_paths: None, diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index c94b89de91..ba0e20accd 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -525,7 +525,7 @@ impl Client { credentials: &Option, ) -> Result { match self.check_website_inner(uri, credentials).await { - Status::Ok(code) if self.require_https && uri.scheme() == "http" => { + Status::Ok(code) if self.require_https && uri.scheme() == "http:" => { if self .check_website_inner(&uri.to_https(), credentials) .await @@ -559,7 +559,7 @@ impl Client { ) -> Status { // Workaround for upstream reqwest panic if validate_url(&uri.url) { - if matches!(uri.scheme(), "http" | "https") { + if matches!(uri.scheme(), "http:" | "https:") { // This is a truly invalid URI with a known scheme. // If we pass that to reqwest it would panic. return Status::Error(ErrorKind::InvalidURI(uri.clone())); @@ -674,10 +674,12 @@ impl Client { /// Check a `file` URI. pub async fn check_file(&self, uri: &Uri) -> Status { - let path = Path::new(uri.url.href()); + let path = Path::new(uri.url.pathname()); + if !path.exists() { return ErrorKind::InvalidFilePath(uri.clone()).into(); } + if self.include_fragments { self.check_fragment(path, uri).await } else { diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 4a79538c3a..79d63b85c9 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -110,8 +110,8 @@ impl Collector { mod tests { use std::{collections::HashSet, convert::TryFrom, fs::File, io::Write}; + use ada_url::Url; use http::StatusCode; - use reqwest::Url; use super::*; use crate::{ @@ -184,7 +184,7 @@ mod tests { }, Input { source: InputSource::RemoteUrl(Box::new( - Url::parse(&mock_server.uri()) + Url::parse(&mock_server.uri(), None) .map_err(|e| (mock_server.uri(), e)) .unwrap(), )), @@ -358,7 +358,7 @@ mod tests { "#; let mock_server = mock_server!(StatusCode::OK, set_body_string(contents)); - let server_uri = Url::parse(&mock_server.uri()).unwrap(); + let server_uri = Url::parse(&mock_server.uri(), None).unwrap(); let input = Input { source: InputSource::RemoteUrl(Box::new(server_uri.clone())), diff --git a/lychee-lib/src/extract/mod.rs b/lychee-lib/src/extract/mod.rs index 3cbd14ec7f..a106202ec5 100644 --- a/lychee-lib/src/extract/mod.rs +++ b/lychee-lib/src/extract/mod.rs @@ -57,7 +57,7 @@ impl Extractor { #[cfg(test)] mod tests { - use reqwest::Url; + use ada_url::Url; use std::{collections::HashSet, path::Path}; use super::*; @@ -189,7 +189,7 @@ mod tests { #[test] fn test_extract_relative_url() { let source = InputSource::RemoteUrl(Box::new( - Url::parse("https://example.com/some-post").unwrap(), + Url::parse("https://example.com/some-post", None).unwrap(), )); let contents = r#" diff --git a/lychee-lib/src/test_utils.rs b/lychee-lib/src/test_utils.rs index dc79f58925..12bd6a211a 100644 --- a/lychee-lib/src/test_utils.rs +++ b/lychee-lib/src/test_utils.rs @@ -44,7 +44,7 @@ pub(crate) fn mail(address: &str) -> Uri { if address.starts_with("mailto:") { Url::parse(address, None) } else { - Url::parse(address, Some("mailto:")) + Url::parse(&*("mailto:".to_string() + address), None) } .expect("Expected valid Mail Address") .into() diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index 210fd4393a..b1cfd38b8f 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -36,22 +36,14 @@ impl Base { } } - pub(crate) fn from_source(source: &InputSource) -> Option { + pub(crate) fn from_source(source: &InputSource) -> Option { match &source { InputSource::RemoteUrl(url) => { // TODO: This should be refactored. // Cases like https://user:pass@example.com are not handled // We can probably use the original URL and just replace the // path component in the caller of this function - if let Some(port) = url.port() { - Url::parse( - &format!("{}://{}:{port}", url.scheme(), url.host_str()?), - None, - ) - .ok() - } else { - Url::parse(&format!("{}://{}", url.scheme(), url.host_str()?), None).ok() - } + Url::parse(&format!("{}//{}", url.protocol(), url.host()), None).ok() } // other inputs do not have a URL to extract a base _ => None, @@ -66,7 +58,7 @@ impl TryFrom<&str> for Base { if let Ok(url) = Url::parse(value, None) { // Cannot be a base // Character after scheme should be '/' - if url.href().chars().nth(url.protocol().len()) == Some('/') { + if url.href().chars().nth(url.protocol().len()) != Some('/') { return Err(ErrorKind::InvalidBase( value.to_string(), "The given URL cannot be a base".to_string(), @@ -127,7 +119,7 @@ mod test_base { ), ] { let url = Url::parse(url, None).unwrap(); - let source = InputSource::String(url.href().to_string()); + let source = InputSource::RemoteUrl(Box::from(url)); let base = Base::from_source(&source); let expected = Url::parse(expected, None).unwrap(); assert_eq!(base, Some(expected)); diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs index 98ae2546f9..efaf72ce45 100644 --- a/lychee-lib/src/types/file.rs +++ b/lychee-lib/src/types/file.rs @@ -54,7 +54,6 @@ impl> From

for FileType { } /// Helper function to check if a path is likely a URL. - fn is_url(path: &Path) -> bool { path.to_str() .and_then(|s| Url::parse(s, None).ok()) diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 628d744ddb..fc65f83f01 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -1,10 +1,10 @@ use crate::types::FileType; use crate::{utils, ErrorKind, Result}; +use ada_url::Url; use async_stream::try_stream; use futures::stream::Stream; use glob::glob_with; use jwalk::WalkDir; -use reqwest::Url; use serde::{Deserialize, Serialize}; use shellexpand::tilde; use std::fmt::Display; @@ -132,7 +132,7 @@ impl Input { ) -> Result { let source = if value == STDIN { InputSource::Stdin - } else if let Ok(url) = Url::parse(value) { + } else if let Ok(url) = Url::parse(value, None) { InputSource::RemoteUrl(Box::new(url)) } else { // this seems to be the only way to determine if this is a glob pattern @@ -159,7 +159,7 @@ impl Input { // by prefixing it with a `http://` scheme. // Curl also uses http (i.e. not https), see // https://github.com/curl/curl/blob/70ac27604a2abfa809a7b2736506af0da8c3c8a9/lib/urlapi.c#L1104-L1124 - let url = Url::parse(&format!("http://{value}")).map_err(|e| { + let url = Url::parse(&format!("http://{value}"), None).map_err(|e| { ErrorKind::ParseUrl(e.to_string(), "Input is not a valid URL".to_string()) })?; InputSource::RemoteUrl(Box::new(url)) @@ -297,13 +297,13 @@ impl Input { async fn url_contents(url: &Url) -> Result { // Assume HTML for default paths - let file_type = if url.path().is_empty() || url.path() == "/" { + let file_type = if url.pathname().is_empty() || url.pathname() == "/" { FileType::Html } else { FileType::from(url.as_str()) }; - let res = reqwest::get(url.clone()) + let res = reqwest::get(url.href()) .await .map_err(ErrorKind::NetworkRequest)?; let input_content = InputContent { diff --git a/lychee-lib/src/types/uri/github.rs b/lychee-lib/src/types/uri/github.rs index 68ba3d97b3..5d6c195169 100644 --- a/lychee-lib/src/types/uri/github.rs +++ b/lychee-lib/src/types/uri/github.rs @@ -76,10 +76,11 @@ impl GithubUri { return Err(ErrorKind::InvalidGithubUrl(uri.to_string())); }; - let parts: Vec<_> = match uri.path_segments() { - Some(parts) => parts.collect(), - None => return Err(ErrorKind::InvalidGithubUrl(uri.to_string())), - }; + let parts = uri.path_segments(); + + if parts.is_empty() { + return Err(ErrorKind::InvalidGithubUrl(uri.to_string())); + } if parts.len() < 2 { // Not a valid org/repo pair. diff --git a/lychee-lib/src/types/uri/valid.rs b/lychee-lib/src/types/uri/valid.rs index f4a06b696c..f26f6f0eab 100644 --- a/lychee-lib/src/types/uri/valid.rs +++ b/lychee-lib/src/types/uri/valid.rs @@ -1,3 +1,4 @@ +use std::net::Ipv6Addr; use std::str::FromStr; use std::{convert::TryFrom, fmt::Display, net::IpAddr}; @@ -70,16 +71,28 @@ impl Uri { /// each as a percent-encoded ASCII string. /// /// Return `None` for cannot-be-a-base URLs. - pub fn path_segments(&self) -> Option> { - Some(self.url.pathname().split('/')) + pub fn path_segments(&self) -> Vec<&str> { + self.url + .pathname() + .split('/') + .filter(|p| !p.is_empty()) + .collect::>() } #[must_use] /// Returns the IP address (either IPv4 or IPv6) of the URI, /// or `None` if it is a domain pub fn host_ip(&self) -> Option { + let host = self.url.host(); match self.url.host_type() { HostType::Domain => None, + HostType::IPV6 => { + if let Ok(addr) = Ipv6Addr::from_str(&self.url.host()[1..host.len() - 1]) { + Some(addr.into()) + } else { + None + } + } _ => IpAddr::from_str(self.url.host()).ok(), } } @@ -131,9 +144,12 @@ impl Uri { pub fn is_loopback(&self) -> bool { match self.url.host_type() { HostType::Domain => false, - _ => IpAddr::from_str(self.url.host()) - .ok() - .is_some_and(|a| a.is_loopback()), + HostType::IPV6 => { + let host = self.url.host(); + Ipv6Addr::from_str(&host[1..host.len() - 1]) + .map_or(false, |addr| addr.is_loopback()) + } + _ => IpAddr::from_str(self.url.host()).map_or(false, |addr| addr.is_loopback()), } } @@ -163,11 +179,12 @@ impl Uri { /// [IETF RFC 4291]: https://tools.ietf.org/html/rfc4291 /// [IETF RFC 3879]: https://tools.ietf.org/html/rfc3879 pub fn is_private(&self) -> bool { + let host = self.url.host(); match self.url.host_type() { - HostType::IPV4 => std::net::Ipv4Addr::from_str(self.url.host()) + HostType::IPV4 => std::net::Ipv4Addr::from_str(host) .ok() .is_some_and(|a| a.is_private()), - HostType::IPV6 => std::net::Ipv6Addr::from_str(self.url.host()) + HostType::IPV6 => std::net::Ipv6Addr::from_str(&host[1..host.len() - 1]) .ok() .is_some_and(|a| Ipv6Network::from(a).is_unique_local()), HostType::Domain => false, @@ -190,12 +207,13 @@ impl Uri { /// [IETF RFC 3927]: https://tools.ietf.org/html/rfc3927 /// [IETF RFC 4291]: https://tools.ietf.org/html/rfc4291 pub fn is_link_local(&self) -> bool { + let host = self.url.host(); match self.url.host_type() { - HostType::IPV4 => std::net::Ipv4Addr::from_str(self.url.host()) + HostType::IPV4 => std::net::Ipv4Addr::from_str(host) .ok() .is_some_and(|a| a.is_link_local()), HostType::IPV6 => { - if let Ok(addr) = std::net::Ipv6Addr::from_str(self.url.host()) { + if let Ok(addr) = Ipv6Addr::from_str(&host[1..host.len() - 1]) { Ipv6Network::from(addr).is_unicast_link_local() } else { false