Skip to content

Commit

Permalink
Allow multiple selectors with their own operations
Browse files Browse the repository at this point in the history
  • Loading branch information
gabebw committed Sep 1, 2019
1 parent 32497ae commit 7ba8274
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 20 deletions.
19 changes: 11 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,23 +37,26 @@ To get an attribute, use `attr{ATTRIBUTE_NAME}`.

Now let's parse a real webpage:

curl https://daringfireball.net | candle 'dl.linkedlist dt a:not([title]) {text}'
curl https://daringfireball.net | candle 'dl a:not([title]) {text}'

Jack Dorsey’s Twitter Account Was Compromised
NetNewsWire 5.0
Filmmaker Mode (a.k.a. Death to Motion Smoothing)
Apple Expands Third-Party Repair Program
Apple Sends Invitations for September 10 Event

We can show the `href` attribute instead:

curl https://daringfireball.net | candle 'dl.linkedlist dt a:not([title]) attr{href}'
curl https://daringfireball.net | candle 'dl a:not([title]) attr{href}'

https://techcrunch.com/2019/08/30/someone-hacked-jack-dorseys-own-twitter-account/
https://inessential.com/2019/08/26/netnewswire_5_0_now_available
https://www.experienceuhd.com/filmmakermode
https://www.apple.com/newsroom/2019/08/apple-offers-customers-even-more-options-for-safe-reliable-repairs/
https://www.loopinsight.com/2019/08/29/apple-sends-invite-for-september-10-event/

Or we can show both the text and the `href`:

curl https://daringfireball.net | candle 'dl a:not([title]) attr{href}, dl a:not([title]) {text}'

https://techcrunch.com/2019/08/30/someone-hacked-jack-dorseys-own-twitter-account/
https://inessential.com/2019/08/26/netnewswire_5_0_now_available
Jack Dorsey’s Twitter Account Was Compromised
NetNewsWire 5.0

## Inspiration

Expand Down
53 changes: 41 additions & 12 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ struct Inputs {
html: String,
}

/// One extraction request: a CSS selector plus what to pull out of each match.
///
/// All string data is borrowed (lifetime `'a`) rather than copied.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Finder<'a> {
    /// CSS selector text, handed to `Selector::parse` when the finder is run.
    selector: &'a str,
    /// Name of the attribute to read from each matched element, if one was requested.
    attr: Option<&'a str>,
    /// `true` when the text content of each matched element was requested.
    is_text: bool,
}

fn read_from_stdin() -> Option<String> {
// It might not be valid UTF-8, so read to a vector of bytes and convert it to UTF-8, lossily
let mut buffer: Vec<u8> = Vec::new();
Expand Down Expand Up @@ -64,30 +70,40 @@ fn main() {
}
}

fn select(document: scraper::Html, captures: regex::Captures) -> Result<Vec<String>, String> {
let selector = Selector::parse(captures.name("selector").unwrap().as_str())
/// Runs every finder against `document`, in order, and concatenates their results.
///
/// Evaluation stops at the first finder for which `select` returns an error,
/// and that error is returned unchanged.
fn select_all(document: scraper::Html, finders: Vec<Finder>) -> Result<Vec<String>, String> {
    finders.iter().try_fold(
        Vec::new(),
        |mut collected: Vec<String>, finder| -> Result<Vec<String>, String> {
            // `?` bails out of the fold as soon as one finder fails.
            collected.extend(select(&document, finder)?);
            Ok(collected)
        },
    )
}

/// Runs a single `Finder` against `document` and returns the extracted strings.
///
/// `finder.selector` is parsed as a CSS selector; a parse failure is reported
/// as `Err("Bad CSS selector: ...")`. For the matching elements the function
/// collects either their text content (when `finder.is_text` is set) or the
/// value of the attribute named by `finder.attr`. A finder that requests
/// neither yields an `Err`.
///
/// NOTE(review): this listing appears to be a rendered diff — each changed
/// statement shows up twice, first in its `captures`-based form (there is no
/// `captures` binding in this signature) and then in its `finder`-based form.
fn select(document: &scraper::Html, finder: &Finder) -> Result<Vec<String>, String> {
let selector = Selector::parse(finder.selector)
.map_err(|e| format!("Bad CSS selector: {:?}", e.kind))?;
let selected = document.select(&selector);

if captures.name("text").is_some() {
if finder.is_text {
// Concatenate all text pieces of each matched element into one string.
Ok(selected.map(|element| element.text().collect()).collect())
} else if let Some(attr) = captures.name("attr") {
} else if let Some(attr) = finder.attr {
// `filter_map` drops matched elements that do not carry the attribute.
Ok(selected
.filter_map(|element| element.value().attr(attr.as_str()).map(|s| s.to_string()))
.filter_map(|element| element.value().attr(attr).map(|s| s.to_string()))
.collect())
} else {
Err("Unknown request".to_string())
// NOTE(review): the message below opens a parenthesis that it never closes.
Err("Unknown finder (not `{text}` or `attr{...}`".to_string())
}
}

/// Parses `inputs.html` into a document, splits `inputs.selector` into one
/// `Finder` per `SELECTOR {text}` / `SELECTOR attr{NAME}` group, and returns
/// the combined results of `select_all`.
///
/// Returns `Err("Please specify {text} or attr{ATTRIBUTE}")` when the selector
/// string contains no such group.
///
/// NOTE(review): this listing appears to be a rendered diff. The first
/// `Regex::new` through the first `Err(...)` use the single-match flow
/// (`re.captures` / `select(document, captures)`), which does not fit the
/// current two-argument, reference-taking `select`; the statements after it
/// are the multi-finder flow.
fn parse(inputs: Inputs) -> Result<Vec<String>, String> {
let document = Html::parse_document(&inputs.html);
let re = Regex::new(r"(?P<selector>.+) (?:(?P<text>\{text\})|(attr\{(?P<attr>[^}]+)\}))$").unwrap();
match re.captures(&inputs.selector) {
Some(captures) => select(document, captures),
None => {
Err("Please specify {text} or attr{ATTRIBUTE}".to_string())
}
// One match per finder: a brace-free selector, a space, then `{text}` or
// `attr{NAME}`, optionally followed by a comma and whitespace. Unlike the
// pattern above, this one has no `$` anchor.
let re = Regex::new(r"(?P<selector>[^{}]+) (?:(?P<text>\{text\})|(attr\{(?P<attr>[^}]+)\}))[,]?\s*").unwrap();
let finders: Vec<Finder> = re.captures_iter(&inputs.selector)
// The `selector` group is mandatory in the pattern, so it is present in every match.
.map(|c| Finder { selector: c.name("selector").unwrap().as_str(), is_text: c.name("text").is_some(), attr: c.name("attr").map(|a| a.as_str()) }).collect();

// No match at all means the caller never said what to extract.
if finders.len() == 0 {
Err("Please specify {text} or attr{ATTRIBUTE}".to_string())
} else {
select_all(document, finders)
}
}

Expand Down Expand Up @@ -134,6 +150,19 @@ mod test {
assert_eq!(result, Ok(vec!("foo".to_string())));
}

#[test]
fn test_multiple_selectors() {
    // Two comma-separated finders aimed at the same element: results come back
    // in finder order, the `class` attribute first and the text content second.
    let markup = r#"
<!DOCTYPE html>
<meta charset="utf-8">
<title>Hello, world!</title>
<h1 class="foo">Hello, <i>world!</i></h1>
"#;
    let expected = vec!["foo".to_string(), "Hello, world!".to_string()];

    let actual = parse(build_inputs(markup, "h1 attr{class}, h1 {text}"));

    assert_eq!(actual, Ok(expected));
}

#[test]
fn test_no_text_or_attr_specification(){
let html = r#"
Expand Down

0 comments on commit 7ba8274

Please sign in to comment.