From 551a803a6392db76a735b448974fba6c29fd0672 Mon Sep 17 00:00:00 2001 From: malteneuss Date: Sat, 20 Jul 2024 23:02:06 +0200 Subject: [PATCH 1/3] Improve introduction example to be copy-pastable. --- README.md | 153 +++++++++++++++++++++++++++++------------------------- 1 file changed, 82 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index b5733bf..c75280a 100644 --- a/README.md +++ b/README.md @@ -2,14 +2,90 @@ Scalpel [![Build status](https://github.com/fimad/scalpel/actions/workflows/stack.yml/badge.svg)](https://github.com/fimad/scalpel/actions/workflows/stack.yml) [![Hackage](https://img.shields.io/hackage/v/scalpel.svg)](https://hackage.haskell.org/package/scalpel) ======= -Scalpel is a web scraping library inspired by libraries like +Scalpel is a convenient web scraping library to extract data from Html webpages. +It's inspired by libraries like [Parsec](http://hackage.haskell.org/package/parsec-3.1.7/docs/Text-Parsec.html) -and Perl's [Web::Scraper](http://search.cpan.org/~miyagawa/Web-Scraper-0.38/). -Scalpel builds on top of [TagSoup](http://hackage.haskell.org/package/tagsoup) -to provide a declarative and monadic interface. +and Perl's [Web::Scraper](http://search.cpan.org/~miyagawa/Web-Scraper-0.38/), +and provides a declarative, monadic interface on top of the robust +HTML parsing library [TagSoup](http://hackage.haskell.org/package/tagsoup) -There are two general mechanisms provided by this library that are used to build -web scrapers: Selectors and Scrapers. +Quickstart +---------- + +```haskell +{-# LANGUAGE OverloadedStrings #-} + +import Control.Applicative ((<|>)) +import Text.HTML.Scalpel + +htmlString :: String +htmlString = + "\ + \ \ + \
\ + \
\ + \ Sally\ + \
Woo hoo!
\ + \
\ + \
\ + \ Bill\ + \ \ + \
\ + \
\ + \ \ + \" + +main :: IO () +main = do + -- We can either scrape a raw html of any StringLike type (fetched before by other means): + let scrapedCommentsFromString = scrapeStringLike htmlString comments + -- prints: Just [TextComment "Sally" "Woo hoo!",ImageComment "Bill" "http://example.com/cat.gif"] + print scrapedCommentsFromString + + -- or let Scalpel fetch and scrape an HTML page for us for convenience : + scrapedCommentsFromUrl <- scrapeURL "http://example.org/article.html" comments + -- example.org doesn't have the HTML above + -- prints: Just [] + print scrapedCommentsFromUrl + +type Author = String + +data Comment + = TextComment Author String + | ImageComment Author URL + deriving (Show, Eq) + +comments :: Scraper String [Comment] +comments = chroots ("div" @: [hasClass "container"]) comment + where + comment :: Scraper String Comment + comment = textComment <|> imageComment + + textComment :: Scraper String Comment + textComment = do + author <- text $ "span" @: [hasClass "author"] + commentText <- text $ "div" @: [hasClass "text"] + return $ TextComment author commentText + + imageComment :: Scraper String Comment + imageComment = do + author <- text $ "span" @: [hasClass "author"] + imageURL <- attr "src" $ "img" @: [hasClass "image"] + return $ ImageComment author imageURL +``` + +This example demonstrates the most important features of this library: +You can parse and extract data from raw HTML text or from a webpage +by providing a URL; here we use a hypothetical HTML page located at +`"http://example.org/article.html"` to extract a list of all +of the comments. + +More examples can be found in the +[examples](https://github.com/fimad/scalpel/tree/master/examples) folder in the +Scalpel git repository. + +To understand the code, it's important to know that this library provides +two main building blocks to build web scrapers: Selectors and Scrapers. Selectors --------- @@ -45,71 +121,6 @@ from the DOM. 
Each primitive defined by this library comes in two variants: singular and plural. The singular variants extract the first instance matching the given selector, while the plural variants match every instance. -Example -------- - -Complete examples can be found in the -[examples](https://github.com/fimad/scalpel/tree/master/examples) folder in the -scalpel git repository. - -The following is an example that demonstrates most of the features provided by -this library. Supposed you have the following hypothetical HTML located at -`"http://example.com/article.html"` and you would like to extract a list of all -of the comments. - -```html - - -
-
- Sally -
Woo hoo!
-
-
- Bill - -
-
- Susan -
WTF!?!
-
-
- - -``` - -The following snippet defines a function, `allComments`, that will download -the web page, and extract all of the comments into a list: - -```haskell -type Author = String - -data Comment - = TextComment Author String - | ImageComment Author URL - deriving (Show, Eq) - -allComments :: IO (Maybe [Comment]) -allComments = scrapeURL "http://example.com/article.html" comments - where - comments :: Scraper String [Comment] - comments = chroots ("div" @: [hasClass "container"]) comment - - comment :: Scraper String Comment - comment = textComment <|> imageComment - - textComment :: Scraper String Comment - textComment = do - author <- text $ "span" @: [hasClass "author"] - commentText <- text $ "div" @: [hasClass "text"] - return $ TextComment author commentText - - imageComment :: Scraper String Comment - imageComment = do - author <- text $ "span" @: [hasClass "author"] - imageURL <- attr "src" $ "img" @: [hasClass "image"] - return $ ImageComment author imageURL -``` Tips & Tricks ------------- From b7e7008ea33b7a2099e08db012db032c0a6c9a0a Mon Sep 17 00:00:00 2001 From: malteneuss Date: Sat, 20 Jul 2024 23:07:24 +0200 Subject: [PATCH 2/3] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c75280a..147c18d 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Scalpel [![Build status](https://github.com/fimad/scalpel/actions/workflows/stack.yml/badge.svg)](https://github.com/fimad/scalpel/actions/workflows/stack.yml) [![Hackage](https://img.shields.io/hackage/v/scalpel.svg)](https://hackage.haskell.org/package/scalpel) ======= -Scalpel is a convenient web scraping library to extract data from Html webpages. +Scalpel is a convenient web scraping library to extract data from HTML webpages. 
It's inspired by libraries like [Parsec](http://hackage.haskell.org/package/parsec-3.1.7/docs/Text-Parsec.html) and Perl's [Web::Scraper](http://search.cpan.org/~miyagawa/Web-Scraper-0.38/), From f6e57bf6419635e3f1e75b3294487255e1914a6a Mon Sep 17 00:00:00 2001 From: malteneuss Date: Sat, 20 Jul 2024 23:14:37 +0200 Subject: [PATCH 3/3] Align examples folder example with Readme --- examples/example-from-documentation/Main.hs | 70 +++++++++++---------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/examples/example-from-documentation/Main.hs b/examples/example-from-documentation/Main.hs index 57067d6..e82353d 100644 --- a/examples/example-from-documentation/Main.hs +++ b/examples/example-from-documentation/Main.hs @@ -1,28 +1,37 @@ {-# LANGUAGE OverloadedStrings #-} +import Control.Applicative ((<|>)) import Text.HTML.Scalpel -import Control.Applicative - - -exampleHtml :: String -exampleHtml = "\ -\ \ -\
\ -\
\ -\ Sally\ -\
Woo hoo!
\ -\
\ -\
\ -\ Bill\ -\ \ -\
\ -\
\ -\ Susan\ -\
WTF!?!
\ -\
\ -\
\ -\ \ -\" + +htmlString :: String +htmlString = + "\ + \ \ + \
\ + \
\ + \ Sally\ + \
Woo hoo!
\ + \
\ + \
\ + \ Bill\ + \ \ + \
\ + \
\ + \ \ + \" + +main :: IO () +main = do + -- We can either scrape a raw html of any StringLike type (fetched before by other means): + let scrapedCommentsFromString = scrapeStringLike htmlString comments + -- prints: Just [TextComment "Sally" "Woo hoo!",ImageComment "Bill" "http://example.com/cat.gif"] + print scrapedCommentsFromString + + -- or let Scalpel fetch and scrape an HTML page for us for convenience : + scrapedCommentsFromUrl <- scrapeURL "http://example.org/article.html" comments + -- example.org doesn't have the HTML above + -- prints: Just [] + print scrapedCommentsFromUrl type Author = String @@ -31,23 +40,20 @@ data Comment | ImageComment Author URL deriving (Show, Eq) -main :: IO () -main = print $ scrapeStringLike exampleHtml comments - where - comments :: Scraper String [Comment] - comments = chroots ("div" @: [hasClass "container"]) comment - +comments :: Scraper String [Comment] +comments = chroots ("div" @: [hasClass "container"]) comment + where comment :: Scraper String Comment comment = textComment <|> imageComment textComment :: Scraper String Comment textComment = do - author <- text $ "span" @: [hasClass "author"] - commentText <- text $ "div" @: [hasClass "text"] + author <- text $ "span" @: [hasClass "author"] + commentText <- text $ "div" @: [hasClass "text"] return $ TextComment author commentText imageComment :: Scraper String Comment imageComment = do - author <- text $ "span" @: [hasClass "author"] - imageURL <- attr "src" $ "img" @: [hasClass "image"] + author <- text $ "span" @: [hasClass "author"] + imageURL <- attr "src" $ "img" @: [hasClass "image"] return $ ImageComment author imageURL