Merge pull request #68 from code-for-venezuela/luis/toy-webscrapper
Luis/toy webscrapper
Showing 21 changed files with 10,173 additions and 869 deletions.
@@ -1,2 +1,2 @@
-poetry==1.0.9
+poetry==1.1.6
 nox==2020.5.24
Empty file.
@@ -0,0 +1,80 @@
"""
Main module interface
"""

# Local imports
from c4v.scraper.scrapers.base_scraper import BaseScraper
from .settings import URL_TO_SCRAPER
from c4v.scraper.utils import get_domain_from_url, valid_url

# Python imports
from typing import List, Type, Dict, Any


def scrape(url: str) -> Dict[str, Any]:
    """
    Scrape data for the given url if such url is scrapable;
    raise ValueError if not.
    Params:
        + url - str : url to scrape
    Return:
        A dict object describing the data that could be extracted
        for this url. Obtained data depends on the url itself, so
        available fields may change depending on the scraped url.
        Dict format:
        {
            "url" : (str) url where the data came from,
            "data": (dict) data scraped for this url
        }
    """
    scraper = _get_scraper_from_url(url)()
    return scraper.scrape(url)


def bulk_scrape(urls: List[str]) -> List[Dict[str, Any]]:
    """
    Perform a bulk scraping over a list of urls.
    The order of the output list is not guaranteed to be
    the same as in the input list.
    Parameters:
        + urls : [str] = urls to be scraped
    Return:
        A list of items scraped for each url in the original list
    """
    items = []
    scrapers = {}
    for url in urls:
        # Group urls by their corresponding scraper
        scraper = _get_scraper_from_url(url)

        if not (url_list := scrapers.get(scraper)):
            url_list = scrapers[scraper] = []

        url_list.append(url)

    # Bulk scrape urls, one scraper at a time
    for (scraper, url_list) in scrapers.items():
        s = scraper()  # Create a new scraper instance
        items.extend(s.bulk_scrape(url_list))

    return items


def _get_scraper_from_url(url: str) -> Type[BaseScraper]:
    """
    Validate that this url is scrapable and return its
    corresponding scraper when it is.
    """
    if not valid_url(url):
        raise ValueError(f"This is not a valid url: {url}")

    domain = get_domain_from_url(url)

    if not (scraper := URL_TO_SCRAPER.get(domain)):
        raise ValueError(f"Unable to scrape this url: {url}")

    return scraper
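For quick reference, a minimal usage sketch of this interface (the urls below are hypothetical, and the sketch assumes scrape/bulk_scrape are importable from the c4v.scraper package):

from c4v.scraper import scrape, bulk_scrape

# Single url: returns {"url": ..., "data": {...}} for a supported domain
article = scrape("https://elpitazo.net/some-article")

# Several urls at once; output order is not guaranteed to match input order
articles = bulk_scrape([
    "https://elpitazo.net/article-1",
    "https://elpitazo.net/article-2",
])

# Malformed or unsupported urls raise ValueError
try:
    scrape("not-a-url")
except ValueError as error:
    print(error)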
Empty file.
@@ -0,0 +1,60 @@
"""
Base class for a scraper.
In order to create and wire a new scraper:
    1) Create a new scraper in the "scrapers" directory
    2) Make your scraper a subclass of BaseScraper
    3) Implement the missing methods (parse & scrape)
    4) Add an entry for it to the URL_TO_SCRAPER map in settings.py,
       mapping from a domain name to your new scraper. Import it if necessary.
"""

# Python imports
from typing import List, Dict, Any


class BaseScraper:
    """
    Base class for scraper implementations
    """

    def parse(self, response) -> Dict[str, Any]:
        """
        Return scraped data from a response object.
        Parameters:
            + response : any = some kind of structure holding an http response
                               from which we can scrape data
        Return:
            A dict with the fields scraped from the response
        """
        pass

    def scrape(self, url: str) -> Dict[str, Any]:
        """
        Return scraped data from a url.
        Parameters:
            + url : str = url to be scraped by this class
        Return:
            A dict with data scraped from the given url,
            if such url is a valid one
        """
        pass

    def bulk_scrape(self, urls: List[str]) -> List[Dict[str, Any]]:
        """
        Return scraped data for a list of urls. Override this method
        if your scraper implementation can handle an optimized
        bulk scraping.
        Parameters:
            + urls : [str] = urls to be scraped
        Return:
            List of scraped items. Notice that the order is not guaranteed
            to be the same as in the input list.
        """
        items = []
        for url in urls:
            if (item := self.scrape(url)):
                items.append(item)

        return items
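To illustrate the four wiring steps from the docstring above, here is a hedged sketch of a hypothetical subclass; the class name, domain, and returned fields are made up for illustration, and step 4 (registering the domain) is sketched after settings.py further below:

from typing import Dict, Any

from c4v.scraper.scrapers.base_scraper import BaseScraper


class ExampleSiteScraper(BaseScraper):
    """Hypothetical scraper for example-site.com (illustration only)."""

    def scrape(self, url: str) -> Dict[str, Any]:
        # A real implementation would download `url` here and feed the
        # response into self.parse(); this stub only shows the expected shape.
        return {"url": url, "data": self.parse(response=None)}

    def parse(self, response) -> Dict[str, Any]:
        # Extract whatever fields the site exposes from the response
        return {"title": "", "body": ""}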
@@ -0,0 +1,50 @@
"""
Base class for scrapy-based scrapers.
In order to create a new scrapy scraper:
    1) Create a new scraper in the "scrapers" folder and make it a subclass
       of this BaseScrapyScraper
    2) Override the "spider" attribute of your new class with a valid
       scrapy spider
    3) Wire it in settings as you would do with a regular scraper
"""

# External imports
from scrapy import Spider

# Internal imports
from c4v.scraper.scrapers.base_scraper import BaseScraper
from c4v.scraper.spider_manager import SpiderManager

# Python imports
from typing import Type, List, Dict, Any


class BaseScrapyScraper(BaseScraper):
    """
    In order to create a new scrapy scraper, just
    inherit this class and assign a new value to the
    "spider" field, a valid scrapy Spider subclass.
    """

    spider: Type[Spider] = None

    def __init__(self):
        if self.spider is None:
            raise TypeError(
                "Spider not defined, "
                + "perhaps you forgot to override the spider "
                + "attribute in your BaseScrapyScraper subclass?"
            )

        self._spider_manager = SpiderManager(self.spider)

    def parse(self, response) -> Dict[str, Any]:
        return self._spider_manager.parse(response)

    def scrape(self, url: str) -> Dict[str, Any]:
        return self._spider_manager.scrape(url)

    def bulk_scrape(self, urls: List[str]) -> List[Dict[str, Any]]:
        return self._spider_manager.bulk_scrape(urls)
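The ElPitazoScraper in the next file is a concrete instance of these three steps: it subclasses BaseScrapyScraper, points its spider attribute at ElPitazoSpider, and is wired to the elpitazo.net domain in settings.py.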
@@ -0,0 +1,15 @@
"""
Scraper to get data from El Pitazo
"""
# Internal imports
from c4v.scraper.scrapers.base_scrapy_scraper import BaseScrapyScraper
from c4v.scraper.spiders.el_pitazo import ElPitazoSpider


class ElPitazoScraper(BaseScrapyScraper):
    """
    Scrapes data from El Pitazo, relying on
    scrapy for this.
    """

    spider = ElPitazoSpider
@@ -0,0 +1,6 @@
"""
Settings specific to scrapy
"""

# Settings passed to the crawler
CRAWLER_SETTINGS = {"LOG_ENABLED": False}
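Since this dict is handed directly to Scrapy's CrawlerProcess, it can carry any standard Scrapy setting. A hedged sketch of an extended configuration; the keys are standard Scrapy settings, but the values are illustrative and not part of this changeset:

CRAWLER_SETTINGS = {
    "LOG_ENABLED": False,
    "DOWNLOAD_DELAY": 0.5,       # illustrative: throttle requests to the target site
    "CONCURRENT_REQUESTS": 8,    # illustrative: cap parallel requests
    "USER_AGENT": "c4v-scraper (example)",
}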
@@ -0,0 +1,17 @@
"""
This file manages multiple settings shared across the scraper,
such as mappings from urls to scrapers
"""
from c4v.scraper.scrapers.el_pitazo_scraper import ElPitazoScraper
import os


# Dict mapping from domain name to scraper
URL_TO_SCRAPER = {
    "elpitazo.net": ElPitazoScraper,
}


# Root dir, so we can get resources from module directories
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
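The keys of URL_TO_SCRAPER are bare domain names; _get_scraper_from_url in the main module extracts the domain from an incoming url and looks it up here. Registering another outlet would only require a new entry, as in this hypothetical sketch (the scraper below does not exist in this changeset):

# Hypothetical registration step for a new outlet (illustration only)
# from c4v.scraper.scrapers.example_site_scraper import ExampleSiteScraper

URL_TO_SCRAPER = {
    "elpitazo.net": ElPitazoScraper,
    # "example-site.com": ExampleSiteScraper,
}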
@@ -0,0 +1,80 @@
# External imports
import scrapy
from scrapy.crawler import CrawlerProcess
import scrapy.signals

# Project imports
import c4v.scraper.scrapy_settings as settings

# Python imports
from typing import List, Dict, Any


class SpiderManager:
    """
    Utility class to perform common operations on
    Spider classes
    """

    process = CrawlerProcess(settings.CRAWLER_SETTINGS)

    def __init__(self, spider) -> None:
        self.spider = spider

    def parse(self, response) -> Dict[str, Any]:
        """
        Return scraped data from a valid response.
        Parameters:
            + response : scrapy.http.Response = response object holding the actual response
        Return:
            dict-like object with scraped data
        """
        spider = self.spider()
        return spider.parse(response)

    def scrape(self, url: str) -> Dict[str, Any]:
        """
        Return scraped data from a single url.
        Parameters:
            + url : str = url whose data is to be scraped. Should be compatible with the given spider
        Return:
            dict-like object with scraped data
        """
        scraped = self.bulk_scrape([url])

        return scraped[0] if scraped else None

    def bulk_scrape(self, urls: List[str]) -> List[Dict[str, Any]]:
        """
        Return scraped data from a list of valid urls.
        Parameters:
            + urls : [str] = urls whose data is to be scraped.
                             Should be compatible with the provided spider
        Return:
            list of dict-like objects with scraped data
        """
        # If there is nothing to do, just return an empty list
        if not urls:
            return []

        # Items accumulator
        items = []

        # Callback function to collect items on the fly
        def items_scraped(item, response, spider):
            items.append({"url": response.url, "data": item})

        # Set up urls to scrape
        self.spider.start_urls = urls

        # Create a crawler for this spider and connect the signal so we can collect items
        crawler = self.process.create_crawler(self.spider)
        crawler.signals.connect(items_scraped, signal=scrapy.signals.item_scraped)

        # Start scraping
        self.process.crawl(crawler)
        self.process.start()

        # Return the collected items
        return items
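A short usage sketch of SpiderManager with the spider defined in the next file (the url is hypothetical). Note that bulk_scrape drives a full CrawlerProcess run, and Scrapy's reactor cannot be restarted within the same Python process, so this is best treated as a one-shot call per process:

from c4v.scraper.spider_manager import SpiderManager
from c4v.scraper.spiders.el_pitazo import ElPitazoSpider

manager = SpiderManager(ElPitazoSpider)
items = manager.bulk_scrape(["https://elpitazo.net/some-article"])  # hypothetical url

for item in items:
    print(item["url"], list(item["data"].keys()))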
Empty file.
@@ -0,0 +1,71 @@
# Internal imports
import c4v.scraper.utils as utils

# External imports
import scrapy

# Python imports
from typing import List, Dict, Any


class ElPitazoSpider(scrapy.Spider):
    """
    Spider to scrape ElPitazo data
    """

    name = "el_pitazo"

    start_urls = []

    def parse(self, response) -> Dict[str, Any]:
        """
        Returns a dict-like structure with the following
        fields:
            + title
            + date
            + categories
            + body
            + author
            + tags
        """
        # These are simple properties; just get their text with a valid
        # selector
        title = response.css(".tdb-title-text ::text").get() or ""
        date = response.css(".entry-date ::text").get() or ""
        author = response.css(".tdb-author-name ::text").get() or ""

        body = self._get_body(response)

        tags = self._get_tags(response)

        # Categories
        categories = response.css(".tdb-entry-category ::text").getall()

        return {
            "title": title,
            "date": date,
            "categories": categories,
            "body": body,
            "author": author,
            "tags": tags,
        }

    def _get_body(self, response) -> str:
        """
        Get article body as a single string
        """
        body = response.css("#bsf_rt_marker > p").getall()
        body = filter(lambda p: p.startswith("<p>") and p.endswith("</p>"), body)
        body = map(utils.strip_http_tags, body)

        body = "\n".join(body)

        return body.strip()

    def _get_tags(self, response) -> List[str]:
        """
        Try to get tags from the document if available
        """
        tags = response.css(".tdb-tags > li > a ::text").getall()
        return tags
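_get_body relies on utils.strip_http_tags, defined in c4v/scraper/utils.py, which is not included in the excerpt above. A minimal sketch of what such a helper could look like, assuming w3lib (already a Scrapy dependency) is acceptable; this is an assumption, not the project's actual implementation:

from w3lib.html import remove_tags


def strip_http_tags(element: str) -> str:
    # Assumed shape only: drop markup such as "<p>...</p>" and keep the readable text
    return remove_tags(element).strip()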