Skip to content

Commit

Permalink
fix: handle more test cases
Browse files Browse the repository at this point in the history
  • Loading branch information
anonrig committed Aug 27, 2023
1 parent 129428b commit 4c66321
Show file tree
Hide file tree
Showing 12 changed files with 56 additions and 42 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions examples/collect_links/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ path = "collect_links.rs"

[dependencies]
lychee-lib = { path = "../../lychee-lib", version = "0.13.0", default-features = false }
ada-url = { version = "1.4.3" }
tokio = { version = "1.32.0", features = ["full"] }
regex = "1.9.3"
http = "0.2.9"
Expand Down
4 changes: 2 additions & 2 deletions examples/collect_links/collect_links.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use ada_url::Url;
use lychee_lib::{Collector, Input, InputSource, Result};
use reqwest::Url;
use std::path::PathBuf;
use tokio_stream::StreamExt;

Expand All @@ -10,7 +10,7 @@ async fn main() -> Result<()> {
let inputs = vec![
Input {
source: InputSource::RemoteUrl(Box::new(
Url::parse("https://github.com/lycheeverse/lychee").unwrap(),
Url::parse("https://github.com/lycheeverse/lychee", None).unwrap(),
)),
file_type_hint: None,
excluded_paths: None,
Expand Down
8 changes: 5 additions & 3 deletions lychee-lib/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,7 @@ impl Client {
credentials: &Option<BasicAuthCredentials>,
) -> Result<Status> {
match self.check_website_inner(uri, credentials).await {
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
Status::Ok(code) if self.require_https && uri.scheme() == "http:" => {
if self
.check_website_inner(&uri.to_https(), credentials)
.await
Expand Down Expand Up @@ -559,7 +559,7 @@ impl Client {
) -> Status {
// Workaround for upstream reqwest panic
if validate_url(&uri.url) {
if matches!(uri.scheme(), "http" | "https") {
if matches!(uri.scheme(), "http:" | "https:") {
// This is a truly invalid URI with a known scheme.
// If we pass that to reqwest it would panic.
return Status::Error(ErrorKind::InvalidURI(uri.clone()));
Expand Down Expand Up @@ -674,10 +674,12 @@ impl Client {

/// Check a `file` URI.
pub async fn check_file(&self, uri: &Uri) -> Status {
let path = Path::new(uri.url.href());
let path = Path::new(uri.url.pathname());

if !path.exists() {
return ErrorKind::InvalidFilePath(uri.clone()).into();
}

if self.include_fragments {
self.check_fragment(path, uri).await
} else {
Expand Down
6 changes: 3 additions & 3 deletions lychee-lib/src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,8 @@ impl Collector {
mod tests {
use std::{collections::HashSet, convert::TryFrom, fs::File, io::Write};

use ada_url::Url;
use http::StatusCode;
use reqwest::Url;

use super::*;
use crate::{
Expand Down Expand Up @@ -184,7 +184,7 @@ mod tests {
},
Input {
source: InputSource::RemoteUrl(Box::new(
Url::parse(&mock_server.uri())
Url::parse(&mock_server.uri(), None)
.map_err(|e| (mock_server.uri(), e))
.unwrap(),
)),
Expand Down Expand Up @@ -358,7 +358,7 @@ mod tests {
</html>"#;
let mock_server = mock_server!(StatusCode::OK, set_body_string(contents));

let server_uri = Url::parse(&mock_server.uri()).unwrap();
let server_uri = Url::parse(&mock_server.uri(), None).unwrap();

let input = Input {
source: InputSource::RemoteUrl(Box::new(server_uri.clone())),
Expand Down
4 changes: 2 additions & 2 deletions lychee-lib/src/extract/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ impl Extractor {

#[cfg(test)]
mod tests {
use reqwest::Url;
use ada_url::Url;
use std::{collections::HashSet, path::Path};

use super::*;
Expand Down Expand Up @@ -189,7 +189,7 @@ mod tests {
#[test]
fn test_extract_relative_url() {
let source = InputSource::RemoteUrl(Box::new(
Url::parse("https://example.com/some-post").unwrap(),
Url::parse("https://example.com/some-post", None).unwrap(),
));

let contents = r#"<html>
Expand Down
2 changes: 1 addition & 1 deletion lychee-lib/src/test_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ pub(crate) fn mail(address: &str) -> Uri {
if address.starts_with("mailto:") {
Url::parse(address, None)
} else {
Url::parse(address, Some("mailto:"))
Url::parse(&*("mailto:".to_string() + address), None)
}
.expect("Expected valid Mail Address")
.into()
Expand Down
16 changes: 4 additions & 12 deletions lychee-lib/src/types/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,14 @@ impl Base {
}
}

pub(crate) fn from_source(source: &InputSource) -> Option<ada_url::Url> {
pub(crate) fn from_source(source: &InputSource) -> Option<Url> {
match &source {
InputSource::RemoteUrl(url) => {
// TODO: This should be refactored.
// Cases like https://user:[email protected] are not handled
// We can probably use the original URL and just replace the
// path component in the caller of this function
if let Some(port) = url.port() {
Url::parse(
&format!("{}://{}:{port}", url.scheme(), url.host_str()?),
None,
)
.ok()
} else {
Url::parse(&format!("{}://{}", url.scheme(), url.host_str()?), None).ok()
}
Url::parse(&format!("{}//{}", url.protocol(), url.host()), None).ok()
}
// other inputs do not have a URL to extract a base
_ => None,
Expand All @@ -66,7 +58,7 @@ impl TryFrom<&str> for Base {
if let Ok(url) = Url::parse(value, None) {
// Cannot be a base
// Character after scheme should be '/'
if url.href().chars().nth(url.protocol().len()) == Some('/') {
if url.href().chars().nth(url.protocol().len()) != Some('/') {
return Err(ErrorKind::InvalidBase(
value.to_string(),
"The given URL cannot be a base".to_string(),
Expand Down Expand Up @@ -127,7 +119,7 @@ mod test_base {
),
] {
let url = Url::parse(url, None).unwrap();
let source = InputSource::String(url.href().to_string());
let source = InputSource::RemoteUrl(Box::from(url));
let base = Base::from_source(&source);
let expected = Url::parse(expected, None).unwrap();
assert_eq!(base, Some(expected));
Expand Down
1 change: 0 additions & 1 deletion lychee-lib/src/types/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ impl<P: AsRef<Path>> From<P> for FileType {
}

/// Helper function to check if a path is likely a URL.

fn is_url(path: &Path) -> bool {
path.to_str()
.and_then(|s| Url::parse(s, None).ok())
Expand Down
10 changes: 5 additions & 5 deletions lychee-lib/src/types/input.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
use crate::types::FileType;
use crate::{utils, ErrorKind, Result};
use ada_url::Url;
use async_stream::try_stream;
use futures::stream::Stream;
use glob::glob_with;
use jwalk::WalkDir;
use reqwest::Url;
use serde::{Deserialize, Serialize};
use shellexpand::tilde;
use std::fmt::Display;
Expand Down Expand Up @@ -132,7 +132,7 @@ impl Input {
) -> Result<Self> {
let source = if value == STDIN {
InputSource::Stdin
} else if let Ok(url) = Url::parse(value) {
} else if let Ok(url) = Url::parse(value, None) {
InputSource::RemoteUrl(Box::new(url))
} else {
// this seems to be the only way to determine if this is a glob pattern
Expand All @@ -159,7 +159,7 @@ impl Input {
// by prefixing it with a `http://` scheme.
// Curl also uses http (i.e. not https), see
// https://github.com/curl/curl/blob/70ac27604a2abfa809a7b2736506af0da8c3c8a9/lib/urlapi.c#L1104-L1124
let url = Url::parse(&format!("http://{value}")).map_err(|e| {
let url = Url::parse(&format!("http://{value}"), None).map_err(|e| {
ErrorKind::ParseUrl(e.to_string(), "Input is not a valid URL".to_string())
})?;
InputSource::RemoteUrl(Box::new(url))
Expand Down Expand Up @@ -297,13 +297,13 @@ impl Input {

async fn url_contents(url: &Url) -> Result<InputContent> {
// Assume HTML for default paths
let file_type = if url.path().is_empty() || url.path() == "/" {
let file_type = if url.pathname().is_empty() || url.pathname() == "/" {
FileType::Html
} else {
FileType::from(url.as_str())
};

let res = reqwest::get(url.clone())
let res = reqwest::get(url.href())
.await
.map_err(ErrorKind::NetworkRequest)?;
let input_content = InputContent {
Expand Down
9 changes: 5 additions & 4 deletions lychee-lib/src/types/uri/github.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,11 @@ impl GithubUri {
return Err(ErrorKind::InvalidGithubUrl(uri.to_string()));
};

let parts: Vec<_> = match uri.path_segments() {
Some(parts) => parts.collect(),
None => return Err(ErrorKind::InvalidGithubUrl(uri.to_string())),
};
let parts = uri.path_segments();

if parts.is_empty() {
return Err(ErrorKind::InvalidGithubUrl(uri.to_string()));
}

if parts.len() < 2 {
// Not a valid org/repo pair.
Expand Down
36 changes: 27 additions & 9 deletions lychee-lib/src/types/uri/valid.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::net::Ipv6Addr;
use std::str::FromStr;
use std::{convert::TryFrom, fmt::Display, net::IpAddr};

Expand Down Expand Up @@ -70,16 +71,28 @@ impl Uri {
/// each as a percent-encoded ASCII string.
///
/// Return `None` for cannot-be-a-base URLs.
pub fn path_segments(&self) -> Option<std::str::Split<char>> {
Some(self.url.pathname().split('/'))
pub fn path_segments(&self) -> Vec<&str> {
self.url
.pathname()
.split('/')
.filter(|p| !p.is_empty())
.collect::<Vec<_>>()
}

#[must_use]
/// Returns the IP address (either IPv4 or IPv6) of the URI,
/// or `None` if it is a domain
pub fn host_ip(&self) -> Option<IpAddr> {
let host = self.url.host();
match self.url.host_type() {
HostType::Domain => None,
HostType::IPV6 => {
if let Ok(addr) = Ipv6Addr::from_str(&self.url.host()[1..host.len() - 1]) {
Some(addr.into())
} else {
None
}
}
_ => IpAddr::from_str(self.url.host()).ok(),
}
}
Expand Down Expand Up @@ -131,9 +144,12 @@ impl Uri {
pub fn is_loopback(&self) -> bool {
match self.url.host_type() {
HostType::Domain => false,
_ => IpAddr::from_str(self.url.host())
.ok()
.is_some_and(|a| a.is_loopback()),
HostType::IPV6 => {
let host = self.url.host();
Ipv6Addr::from_str(&host[1..host.len() - 1])
.map_or(false, |addr| addr.is_loopback())
}
_ => IpAddr::from_str(self.url.host()).map_or(false, |addr| addr.is_loopback()),
}
}

Expand Down Expand Up @@ -163,11 +179,12 @@ impl Uri {
/// [IETF RFC 4291]: https://tools.ietf.org/html/rfc4291
/// [IETF RFC 3879]: https://tools.ietf.org/html/rfc3879
pub fn is_private(&self) -> bool {
let host = self.url.host();
match self.url.host_type() {
HostType::IPV4 => std::net::Ipv4Addr::from_str(self.url.host())
HostType::IPV4 => std::net::Ipv4Addr::from_str(host)
.ok()
.is_some_and(|a| a.is_private()),
HostType::IPV6 => std::net::Ipv6Addr::from_str(self.url.host())
HostType::IPV6 => std::net::Ipv6Addr::from_str(&host[1..host.len() - 1])
.ok()
.is_some_and(|a| Ipv6Network::from(a).is_unique_local()),
HostType::Domain => false,
Expand All @@ -190,12 +207,13 @@ impl Uri {
/// [IETF RFC 3927]: https://tools.ietf.org/html/rfc3927
/// [IETF RFC 4291]: https://tools.ietf.org/html/rfc4291
pub fn is_link_local(&self) -> bool {
let host = self.url.host();
match self.url.host_type() {
HostType::IPV4 => std::net::Ipv4Addr::from_str(self.url.host())
HostType::IPV4 => std::net::Ipv4Addr::from_str(host)
.ok()
.is_some_and(|a| a.is_link_local()),
HostType::IPV6 => {
if let Ok(addr) = std::net::Ipv6Addr::from_str(self.url.host()) {
if let Ok(addr) = Ipv6Addr::from_str(&host[1..host.len() - 1]) {
Ipv6Network::from(addr).is_unicast_link_local()
} else {
false
Expand Down

0 comments on commit 4c66321

Please sign in to comment.