Merge pull request #68 from code-for-venezuela/luis/toy-webscrapper
Luis/toy webscrapper
LDiazN authored May 17, 2021
2 parents 5069e31 + 32ead25 commit 02e7935
Showing 21 changed files with 10,173 additions and 869 deletions.
2,445 changes: 1,577 additions & 868 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -35,6 +35,8 @@ pandas = "1.0.5"
nltk = "^3.5"
google-cloud-bigquery = "1.25.0"
tensorflow-cpu = "2.3.1"
Scrapy = "^2.5.0"
beautifulsoup4 = "^4.9.3"

[tool.poetry.dev-dependencies]
pytest = "^5.4.3"
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,2 +1,2 @@
poetry==1.0.9
poetry==1.1.6
nox==2020.5.24
Empty file added src/c4v/scraper/__init__.py
Empty file.
80 changes: 80 additions & 0 deletions src/c4v/scraper/scraper.py
@@ -0,0 +1,80 @@
"""
Main module interface
"""

# Local imports
from c4v.scraper.scrapers.base_scraper import BaseScraper
from .settings import URL_TO_SCRAPER
from c4v.scraper.utils import get_domain_from_url, valid_url

# Python imports
from typing import List, Type, Dict, Any


def scrape(url: str) -> Dict[str, Any]:
"""
Scrape data for the given url if such url is scrapable,
raise ValueError if not.
Params:
+ url : str = url to scrape
Return:
A dict object describing the data that could be
extracted for this url. Obtained data depends on the url itself,
so available data may change depending on the scraped url.
Dict format:
{
"url" : (str) url where the data came from,
"data": (dict) Data scraped for this url
}
"""
scraper = _get_scraper_from_url(url)()
return scraper.scrape(url)


def bulk_scrape(urls: List[str]) -> List[Dict[str, Any]]:
"""
Perform a bulk scraping over a list of urls.
The order of the returned items is not guaranteed to be
the same as in the input list.
Parameters:
+ urls : [str] = Urls to be scraped
Return:
A list of items scraped for each url in the original list
"""

items = []
scrapers = {}
for url in urls:
# Classify urls to its according scraper
scraper = _get_scraper_from_url(url)

if not (url_list := scrapers.get(scraper)):
url_list = scrapers[scraper] = []

url_list.append(url)

# Bulk scrape urls
for (scraper, url_list) in scrapers.items():
s = scraper() # Create a new scraper instance
items.extend(s.bulk_scrape(url_list))

return items


def _get_scraper_from_url(url: str) -> Type[BaseScraper]:
"""
Validate that this url is scrapable and return its
corresponding scraper when it is
"""

if not valid_url(url):
raise ValueError(f"This is not a valid url: {url}")

domain = get_domain_from_url(url)

if not (scraper := URL_TO_SCRAPER.get(domain)):
raise ValueError(f"Unable to scrap this url: {url}")

return scraper
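
A minimal usage sketch of this interface (not part of the diff; the urls are placeholders, and only domains registered in URL_TO_SCRAPER, currently elpitazo.net, can be scraped):

# Illustrative usage only -- the urls below are placeholders, not real articles.
from c4v.scraper.scraper import scrape, bulk_scrape

item = scrape("https://elpitazo.net/occidente/some-article/")
print(item["url"], sorted(item["data"].keys()))

items = bulk_scrape([
    "https://elpitazo.net/occidente/article-one/",
    "https://elpitazo.net/oriente/article-two/",
])
for item in items:
    print(item["url"])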
Empty file.
60 changes: 60 additions & 0 deletions src/c4v/scraper/scrapers/base_scraper.py
@@ -0,0 +1,60 @@
"""
Base class for a scraper.
In order to create and wire a new scraper:
1) Create a new scraper in the "scrapers" directory
2) Make your scraper a subclass of BaseScraper
3) Implement the missing methods (parse & scrape)
4) Add an entry to the URL_TO_SCRAPER map in settings.py, mapping from
a domain name to your new scraper. Import it if necessary
"""

# Python imports
from typing import List, Dict, Any


class BaseScraper:
"""
Base class for scrapers implementations
"""

def parse(self, response) -> Dict[str, Any]:
"""
Return scraped data from a response object
Parameters:
+ response : any = some kind of structure holding an http response
from which we can scrape data
Return:
A dict with scraped fields from the response
"""
pass

def scrape(self, url: str) -> Dict[str, Any]:
"""
Return scraped data from a url.
Parameters:
+ url : str = url to be scraped by this class
Return:
A dict with scraped data from the given url
if such url is a valid one
"""
pass

def bulk_scrape(self, urls: List[str]) -> List[Dict[str, Any]]:
"""
Return scraped data for a list of urls. Override it
if your scraper implementation can handle optimized
bulk scraping.
Parameters:
+ urls : [str] = urls to be scraped
Return:
List of scraped items. Note that the order is not guaranteed to be
the same as in the input list.
"""

items = []
for url in urls:
if (item := self.scrape(url)):
items.append(item)

return items
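
To make steps 1-4 above concrete, a hypothetical non-scrapy scraper could be sketched like this. ExampleScraper and example.com are invented for illustration and are not part of this PR; the sketch uses beautifulsoup4 (added by this PR) plus the standard library. Wiring it would then mean adding "example.com": ExampleScraper to URL_TO_SCRAPER in settings.py.

# Hypothetical BaseScraper subclass -- a sketch, not code from this PR.
from typing import Dict, Any
from urllib.request import urlopen

from bs4 import BeautifulSoup

from c4v.scraper.scrapers.base_scraper import BaseScraper


class ExampleScraper(BaseScraper):
    """Toy scraper for the made-up domain example.com"""

    def parse(self, response) -> Dict[str, Any]:
        # in this sketch, "response" is just an html string
        soup = BeautifulSoup(response, "html.parser")
        title = soup.title.get_text(strip=True) if soup.title else ""
        return {"title": title}

    def scrape(self, url: str) -> Dict[str, Any]:
        html = urlopen(url).read().decode("utf-8")
        return {"url": url, "data": self.parse(html)}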
50 changes: 50 additions & 0 deletions src/c4v/scraper/scrapers/base_scrapy_scraper.py
@@ -0,0 +1,50 @@
"""
Base class for scrapy-based scrapers.
In order to create a new scrapy scraper:
1) Create a new scraper in the "scrapers" folder, and make it a subclass
of this BaseScrapyScraper
2) Override the "spider" attribute of your new class with a valid
scrapy spider
3) Wire it in settings as you would do with a regular scraper
"""

# External imports
from scrapy import Spider

# Internal imports
from c4v.scraper.scrapers.base_scraper import BaseScraper
from c4v.scraper.spider_manager import SpiderManager

# Python imports
from typing import Type, List, Dict, Any


class BaseScrapyScraper(BaseScraper):
"""
In order to create a new scrapy scraper, just
inherit this class and assign a new value to the
"spider" field, a valid scrapy Spider subclass.
"""

spider: Type[Spider] = None

def __init__(self):

if self.spider is None:
raise TypeError(
"Spider not defined,"
+ "perhaps you forgot to override spider"
+ "attribute in BaseScrapyScraper subclass?"
)

self._spider_manager = SpiderManager(self.spider)

def parse(self, response) -> Dict[str, Any]:
return self._spider_manager.parse(response)

def scrape(self, url: str) -> Dict[str, Any]:
return self._spider_manager.scrape(url)

def bulk_scrape(self, urls: List[str]) -> List[Dict[str, Any]]:
return self._spider_manager.bulk_scrape(urls)
15 changes: 15 additions & 0 deletions src/c4v/scraper/scrapers/el_pitazo_scraper.py
@@ -0,0 +1,15 @@
"""
Scraper to get data from El Pitazo
"""
# Internal imports
from c4v.scraper.scrapers.base_scrapy_scraper import BaseScrapyScraper
from c4v.scraper.spiders.el_pitazo import ElPitazoSpider


class ElPitazoScraper(BaseScrapyScraper):
"""
Scrapes data from ElPitazo, relying on
scrapy for this.
"""

spider = ElPitazoSpider
6 changes: 6 additions & 0 deletions src/c4v/scraper/scrapy_settings.py
@@ -0,0 +1,6 @@
"""
Settings specific to scrapy
"""

# Settings passed to the crawler
CRAWLER_SETTINGS = {"LOG_ENABLED": False}
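
CRAWLER_SETTINGS is handed directly to scrapy's CrawlerProcess, so any standard scrapy setting could go here; a hedged sketch (the commented keys are illustrative and not part of this PR, which only disables logging):

# Sketch only -- the commented keys are standard scrapy settings shown for
# illustration; this PR's actual value only sets LOG_ENABLED.
CRAWLER_SETTINGS = {
    "LOG_ENABLED": False,
    # "LOG_LEVEL": "ERROR",      # if logging were re-enabled
    # "DOWNLOAD_DELAY": 1.0,     # throttle requests to the target site
    # "USER_AGENT": "c4v-scraper",
}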
17 changes: 17 additions & 0 deletions src/c4v/scraper/settings.py
@@ -0,0 +1,17 @@
"""
This file manages multiple settings shared across the scraper,
such as mappings from urls to scrapers
"""
from c4v.scraper.scrapers.el_pitazo_scraper import ElPitazoScraper
import os


# Dict mapping a domain name to its
# corresponding scraper
URL_TO_SCRAPER = {
"elpitazo.net": ElPitazoScraper,
}


# root dir, so we can get resources from module directories
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
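
Registering a new scraper (step 4 in base_scraper.py) only requires one more entry in this map; the commented line is a hypothetical example, not part of this PR:

# Hypothetical wiring sketch -- only elpitazo.net is actually mapped in this PR.
URL_TO_SCRAPER = {
    "elpitazo.net": ElPitazoScraper,
    # "example.com": ExampleScraper,  # a new scraper would be registered like this
}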
80 changes: 80 additions & 0 deletions src/c4v/scraper/spider_manager.py
@@ -0,0 +1,80 @@
# external imports
import scrapy
from scrapy.crawler import CrawlerProcess
import scrapy.signals

# Project imports
import c4v.scraper.scrapy_settings as settings

# Python imports
from typing import List, Dict, Any


class SpiderManager:
"""
Utility class to perform common operations on
Spider classes
"""

process = CrawlerProcess(settings.CRAWLER_SETTINGS)

def __init__(self, spider) -> None:
self.spider = spider

def parse(self, response) -> Dict[str, Any]:
"""
return scraped data from a valid response
Parameters:
+ response : scrapy.http.Response = response object holding the actual response
Return:
dict like object with scraped data
"""
spider = self.spider()
return spider.parse(response)

def scrape(self, url: str) -> Dict[str, Any]:
"""
Return scraped data from a single Url
Parameters:
+ url : str = url whose data is to be scraped. Should be compatible with the given spider
Return:
dict like object with scraped data
"""
scraped = self.bulk_scrape([url])

return scraped[0] if scraped else None

def bulk_scrape(self, urls: List[str]) -> List[Dict[str, Any]]:
"""
return scraped data from a list of valid URLs
Parameters:
+ urls : [str] = urls whose data is to be scraped.
Should be compatible with the provided spider
Return:
list of dict like object with scraped data
"""

# if nothing to do, just return an empty list
if not urls:
return []

# Items accumulator
items = []

# callback function to collect items on the fly
def items_scraped(item, response, spider):
items.append({"url": response.url, "data": item})

# set up urls to scrape
self.spider.start_urls = urls

# create crawler for this spider, connect signal so we can collect items
crawler = self.process.create_crawler(self.spider)
crawler.signals.connect(items_scraped, signal=scrapy.signals.item_scraped)

# start scraping
self.process.crawl(crawler)
self.process.start()

# return the scraped items
return items
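
A usage sketch for SpiderManager with the spider added in this PR (the url is a placeholder; note that CrawlerProcess starts the Twisted reactor, which cannot be restarted, so bulk_scrape can effectively only be driven once per process):

# Illustrative usage -- the url is a placeholder, not a real article.
from c4v.scraper.spider_manager import SpiderManager
from c4v.scraper.spiders.el_pitazo import ElPitazoSpider

manager = SpiderManager(ElPitazoSpider)
items = manager.bulk_scrape(["https://elpitazo.net/occidente/some-article/"])
for item in items:
    print(item["url"], item["data"].get("title"))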
Empty file.
71 changes: 71 additions & 0 deletions src/c4v/scraper/spiders/el_pitazo.py
@@ -0,0 +1,71 @@
# Internal imports
import c4v.scraper.utils as utils

# External imports
import scrapy

# Python imports
from typing import List, Dict, Any


class ElPitazoSpider(scrapy.Spider):
"""
Spider to scrape ElPitazo data
"""

name = "el_pitazo"

start_urls = []

def parse(self, response) -> Dict[str, Any]:
"""
Returns a dict like structure with the following
fields:
+ title
+ date
+ categories
+ body
+ author
+ tags
"""

# These are simple properties, just get their text with a valid
# selector
title = response.css(".tdb-title-text ::text").get() or ""
date = response.css(".entry-date ::text").get() or ""
author = response.css(".tdb-author-name ::text").get() or ""

body = self._get_body(response)

tags = self._get_tags(response)

# categories
categories = response.css(".tdb-entry-category ::text").getall()

return {
"title": title,
"date": date,
"categories": categories,
"body": body,
"author": author,
"tags": tags,
}

def _get_body(self, response) -> str:
"""
Get article body as a single string
"""
body = response.css("#bsf_rt_marker > p").getall()
body = filter(lambda p: p.startswith("<p>") and p.endswith("</p>"), body)
body = map(utils.strip_http_tags, body)

body = "\n".join(body)

return body.strip()

def _get_tags(self, response) -> List[str]:
"""
Try to get tags from document if available
"""
tags = response.css(".tdb-tags > li > a ::text").getall()
return tags
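
For reference, a successful parse of an article page yields a dict shaped like the following (every value here is an invented placeholder, not real scraped content):

# Illustrative output shape only -- all values below are made up.
{
    "title": "Some headline",
    "date": "2021-05-17",
    "categories": ["Regiones"],
    "body": "First paragraph...\nSecond paragraph...",
    "author": "Redacción El Pitazo",
    "tags": ["servicios"],
}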

0 comments on commit 02e7935
