From 7ba8274c675bbb31ac7a2618e227b380561fc5ae Mon Sep 17 00:00:00 2001 From: Gabe Berke-Williams Date: Sat, 31 Aug 2019 20:28:15 -0700 Subject: [PATCH] Allow multiple selectors with their own operations --- README.md | 19 +++++++++++-------- src/main.rs | 53 +++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 52 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index de775da..19dca67 100644 --- a/README.md +++ b/README.md @@ -37,23 +37,26 @@ To get an attribute, use `attr{ATTRIBUTE_NAME}`. Now let's parse a real webpage: - curl https://daringfireball.net | candle 'dl.linkedlist dt a:not([title]) {text}' + curl https://daringfireball.net | candle 'dl a:not([title]) {text}' Jack Dorsey’s Twitter Account Was Compromised NetNewsWire 5.0 - Filmmaker Mode (a.k.a. Death to Motion Smoothing) - Apple Expands Third-Party Repair Program - Apple Sends Invitations for September 10 Event We can show the `href` attribute instead: - curl https://daringfireball.net | candle 'dl.linkedlist dt a:not([title]) attr{href}' + curl https://daringfireball.net | candle 'dl a:not([title]) attr{href}' https://techcrunch.com/2019/08/30/someone-hacked-jack-dorseys-own-twitter-account/ https://inessential.com/2019/08/26/netnewswire_5_0_now_available - https://www.experienceuhd.com/filmmakermode - https://www.apple.com/newsroom/2019/08/apple-offers-customers-even-more-options-for-safe-reliable-repairs/ - https://www.loopinsight.com/2019/08/29/apple-sends-invite-for-september-10-event/ + +Or we can show both the text and the `href`: + + curl https://daringfireball.net | candle 'dl a:not([title]) attr{href}, dl a:not([title]) {text}' + + https://techcrunch.com/2019/08/30/someone-hacked-jack-dorseys-own-twitter-account/ + https://inessential.com/2019/08/26/netnewswire_5_0_now_available + Jack Dorsey’s Twitter Account Was Compromised + NetNewsWire 5.0 ## Inspiration diff --git a/src/main.rs b/src/main.rs index 756118b..45431f4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,6 +11,12 @@ struct Inputs { html: String, } +struct Finder<'a> { + selector: &'a str, + attr: Option<&'a str>, + is_text: bool, +} + fn read_from_stdin() -> Option { // It might not be valid UTF-8, so read to a vector of bytes and convert it to UTF-8, lossily let mut buffer: Vec = Vec::new(); @@ -64,30 +70,40 @@ fn main() { } } -fn select(document: scraper::Html, captures: regex::Captures) -> Result, String> { - let selector = Selector::parse(captures.name("selector").unwrap().as_str()) +fn select_all(document: scraper::Html, finders: Vec) -> Result, String> { + let mut results: Vec = Vec::new(); + for finder in finders { + results.extend(select(&document, &finder)?); + } + Ok(results) +} + +fn select(document: &scraper::Html, finder: &Finder) -> Result, String> { + let selector = Selector::parse(finder.selector) .map_err(|e| format!("Bad CSS selector: {:?}", e.kind))?; let selected = document.select(&selector); - if captures.name("text").is_some() { + if finder.is_text { Ok(selected.map(|element| element.text().collect()).collect()) - } else if let Some(attr) = captures.name("attr") { + } else if let Some(attr) = finder.attr { Ok(selected - .filter_map(|element| element.value().attr(attr.as_str()).map(|s| s.to_string())) + .filter_map(|element| element.value().attr(attr).map(|s| s.to_string())) .collect()) } else { - Err("Unknown request".to_string()) + Err("Unknown finder (not `{text}` or `attr{...}`".to_string()) } } fn parse(inputs: Inputs) -> Result, String> { let document = Html::parse_document(&inputs.html); - let re = Regex::new(r"(?P.+) (?:(?P\{text\})|(attr\{(?P[^}]+)\}))$").unwrap(); - match re.captures(&inputs.selector) { - Some(captures) => select(document, captures), - None => { - Err("Please specify {text} or attr{ATTRIBUTE}".to_string()) - } + let re = Regex::new(r"(?P[^{}]+) (?:(?P\{text\})|(attr\{(?P[^}]+)\}))[,]?\s*").unwrap(); + let finders: Vec = re.captures_iter(&inputs.selector) + .map(|c| Finder { selector: c.name("selector").unwrap().as_str(), is_text: c.name("text").is_some(), attr: c.name("attr").map(|a| a.as_str()) }).collect(); + + if finders.len() == 0 { + Err("Please specify {text} or attr{ATTRIBUTE}".to_string()) + } else { + select_all(document, finders) } } @@ -134,6 +150,19 @@ mod test { assert_eq!(result, Ok(vec!("foo".to_string()))); } + #[test] + fn test_multiple_selectors(){ + let html = r#" + + + Hello, world! +

Hello, world!

+ "#; + let selector = "h1 attr{class}, h1 {text}"; + let result = parse(build_inputs(html, selector)); + assert_eq!(result, Ok(vec!("foo".to_string(), "Hello, world!".to_string()))); + } + #[test] fn test_no_text_or_attr_specification(){ let html = r#"