Reduce false positives in URL blocklist to reduce scunthorpe problem … #5282

Open
wants to merge 1 commit into base: main
4 changes: 3 additions & 1 deletion crates/api_common/src/utils.rs
@@ -552,7 +552,9 @@ pub async fn get_url_blocklist(context: &LemmyContext) -> LemmyResult<RegexSet>
   let urls = LocalSiteUrlBlocklist::get_all(&mut context.pool()).await?;

   // The urls are already validated on saving, so just escape them.
-  let regexes = urls.iter().map(|url| escape(&url.url));
+  // If this regex creation changes it must be synced with
+  // lemmy_utils::utils::markdown::create_url_blocklist_test_regex_set.
+  let regexes = urls.iter().map(|url| format!(r"\b{}\b", escape(&url.url)));

   let set = RegexSet::new(regexes)?;
   Ok(set)
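For context on the change above: escaping alone turns each blocklist entry into a plain substring match, so an entry like rt.com also hits deviantart.com. Wrapping the escaped entry in `\b` requires a word boundary on each side. A minimal standalone sketch of that behaviour (not part of the diff; the rt.com entry is only an example), using just the regex crate:

```rust
use regex::{escape, RegexSet};

fn main() -> Result<(), regex::Error> {
  // Old behaviour: plain escaping means the pattern can match anywhere in the text.
  let old = RegexSet::new([escape("rt.com")])?;
  assert!(old.is_match("deviantart.com")); // false positive

  // New behaviour: `\b` on both sides of the escaped entry.
  let new = RegexSet::new([format!(r"\b{}\b", escape("rt.com"))])?;
  assert!(!new.is_match("deviantart.com")); // preceding `a` is a word char, no boundary
  assert!(!new.is_match("check out rt.computer")); // trailing `p` is a word char, no boundary
  assert!(new.is_match("go to rt.com/abc")); // a genuinely blocked URL is still caught

  Ok(())
}
```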
2 changes: 2 additions & 0 deletions crates/api_crud/src/site/update.rs
@@ -151,6 +151,8 @@ pub async fn update_site(
     .ok();

   if let Some(url_blocklist) = data.blocked_urls.clone() {
+    // If this validation changes it must be synced with
+    // lemmy_utils::utils::markdown::create_url_blocklist_test_regex_set.
     let parsed_urls = check_urls_are_valid(&url_blocklist)?;
     LocalSiteUrlBlocklist::replace(&mut context.pool(), parsed_urls).await?;
   }
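The entries validated here are what get_url_blocklist() later compiles into a single RegexSet, so all blocklist entries can be checked in one pass over the text. As a side note (regex-crate behaviour, not something this PR adds), a RegexSet can also report which entries matched; a small sketch with hypothetical entries:

```rust
use regex::{escape, RegexSet};

fn main() -> Result<(), regex::Error> {
  // Hypothetical blocklist entries, escaped and `\b`-wrapped as in get_url_blocklist().
  let set = RegexSet::new(
    ["example.com/banned_page", "rt.com"]
      .iter()
      .map(|entry| format!(r"\b{}\b", escape(entry))),
  )?;

  // `is_match` answers "is anything on the blocklist present?",
  assert!(set.is_match("see https://rt.com/abc"));

  // while `matches` yields the indices of the entries that were hit.
  let hits: Vec<usize> = set.matches("see https://rt.com/abc").into_iter().collect();
  assert_eq!(hits, vec![1]);

  Ok(())
}
```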
62 changes: 40 additions & 22 deletions crates/utils/src/utils/markdown/mod.rs
@@ -47,8 +47,10 @@ pub fn markdown_check_for_blocked_urls(text: &str, blocklist: &RegexSet) -> LemmyResult<()>
 mod tests {

   use super::*;
+  use crate::utils::validation::check_urls_are_valid;
   use image_links::markdown_rewrite_image_links;
   use pretty_assertions::assert_eq;
+  use regex::escape;

   #[test]
   fn test_basic_markdown() {
@@ -191,9 +193,20 @@ mod tests {
     });
   }

+  // This replicates the logic when saving url blocklist patterns and querying them.
+  // Refer to lemmy_api_crud::site::update::update_site and
+  // lemmy_api_common::utils::get_url_blocklist().
+  fn create_url_blocklist_test_regex_set(patterns: Vec<&str>) -> LemmyResult<RegexSet> {
+    let url_blocklist = patterns.iter().map(|&s| s.to_string()).collect();
+    let valid_urls = check_urls_are_valid(&url_blocklist)?;
+    let regexes = valid_urls.iter().map(|p| format!(r"\b{}\b", escape(p)));
+    let set = RegexSet::new(regexes)?;
+    Ok(set)
+  }
+
   #[test]
   fn test_url_blocking() -> LemmyResult<()> {
-    let set = RegexSet::new(vec![r"(https://)?example\.com/?"])?;
+    let set = create_url_blocklist_test_regex_set(vec!["example.com/"])?;

     assert!(
       markdown_check_for_blocked_urls(&String::from("[](https://example.com)"), &set).is_err()
@@ -221,37 +234,42 @@
     )
     .is_err());

-    let set = RegexSet::new(vec![r"(https://)?example\.com/spam\.jpg"])?;
-    assert!(markdown_check_for_blocked_urls(
-      &String::from("![](https://example.com/spam.jpg)"),
-      &set
-    )
-    .is_err());
+    let set = create_url_blocklist_test_regex_set(vec!["example.com/spam.jpg"])?;
+    assert!(markdown_check_for_blocked_urls("![](https://example.com/spam.jpg)", &set).is_err());
+    assert!(markdown_check_for_blocked_urls("![](https://example.com/spam.jpg1)", &set).is_ok());
+    // TODO: the following should not be matched, scunthorpe problem.
+    assert!(
+      markdown_check_for_blocked_urls("![](https://example.com/spam.jpg.html)", &set).is_err()
+    );

-    let set = RegexSet::new(vec![
-      r"(https://)?quo\.example\.com/?",
-      r"(https://)?foo\.example\.com/?",
-      r"(https://)?bar\.example\.com/?",
+    let set = create_url_blocklist_test_regex_set(vec![
+      r"quo.example.com/",
+      r"foo.example.com/",
+      r"bar.example.com/",
     ])?;

-    assert!(
-      markdown_check_for_blocked_urls(&String::from("https://baz.example.com"), &set).is_ok()
-    );
+    assert!(markdown_check_for_blocked_urls("https://baz.example.com", &set).is_ok());

-    assert!(
-      markdown_check_for_blocked_urls(&String::from("https://bar.example.com"), &set).is_err()
-    );
+    assert!(markdown_check_for_blocked_urls("https://bar.example.com", &set).is_err());

-    let set = RegexSet::new(vec![r"(https://)?example\.com/banned_page"])?;
+    let set = create_url_blocklist_test_regex_set(vec!["example.com/banned_page"])?;

-    assert!(
-      markdown_check_for_blocked_urls(&String::from("https://example.com/page"), &set).is_ok()
-    );
+    assert!(markdown_check_for_blocked_urls("https://example.com/page", &set).is_ok());

-    let set = RegexSet::new(vec![r"(https://)?ex\.mple\.com/?"])?;
+    let set = create_url_blocklist_test_regex_set(vec!["ex.mple.com/"])?;

     assert!(markdown_check_for_blocked_urls("example.com", &set).is_ok());

+    let set = create_url_blocklist_test_regex_set(vec!["rt.com/"])?;
+
+    assert!(markdown_check_for_blocked_urls("deviantart.com", &set).is_ok());
+    assert!(markdown_check_for_blocked_urls("art.com.example.com", &set).is_ok());
+    assert!(markdown_check_for_blocked_urls("https://rt.com/abc", &set).is_err());
+    assert!(markdown_check_for_blocked_urls("go to rt.com.", &set).is_err());
+    assert!(markdown_check_for_blocked_urls("check out rt.computer", &set).is_ok());
+    // TODO: the following should not be matched, scunthorpe problem.
+    assert!(markdown_check_for_blocked_urls("rt.com.example.com", &set).is_err());
+
     Ok(())
   }

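On the two TODO cases left in the test: `\b` only asserts a transition between a word character and a non-word character (or the text edge). Since `.` and `/` are non-word characters, a boundary still exists after `com` in rt.com.example.com and after `jpg` in spam.jpg.html, so those texts keep matching. A minimal sketch of that edge case, assuming the same escaping and `\b` wrapping as above:

```rust
use regex::{escape, RegexSet};

fn main() -> Result<(), regex::Error> {
  let set = RegexSet::new([format!(r"\b{}\b", escape("rt.com"))])?;

  // A trailing word character breaks the boundary, so this is no longer matched.
  assert!(!set.is_match("rt.computer"));

  // A trailing `.` is a non-word character, so the boundary after "com" still
  // holds and the blocked domain is found inside a longer one (the open TODO).
  assert!(set.is_match("rt.com.example.com"));

  Ok(())
}
```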