Merge pull request #68 from code-for-venezuela/luis/toy-webscrapper
Luis/toy webscrapper
Showing 21 changed files with 10,173 additions and 869 deletions.
@@ -1,2 +1,2 @@
-poetry==1.0.9
+poetry==1.1.6
 nox==2020.5.24
Empty file.
@@ -0,0 +1,80 @@
"""
Main module interface
"""

# Local imports
from c4v.scraper.scrapers.base_scraper import BaseScraper
from .settings import URL_TO_SCRAPER
from c4v.scraper.utils import get_domain_from_url, valid_url

# Python imports
from typing import List, Type, Dict, Any


def scrape(url: str) -> Dict[str, Any]:
    """
    Scrape data for the given url if such url is scrapable;
    raise ValueError if not.
    Params:
        + url - str : url to scrape
    Return:
        A dict object describing the data that could be extracted
        for this url. Obtained data depends on the url itself, so
        available fields may change depending on the scraped url.
        Dict format:
        {
            "url" : (str) url where the data came from,
            "data": (dict) data scraped for this url
        }
    """
    scraper = _get_scraper_from_url(url)()
    return scraper.scrape(url)


def bulk_scrape(urls: List[str]) -> List[Dict[str, Any]]:
    """
    Perform a bulk scraping over a list of urls.
    The order of the output list is not guaranteed to be
    the same as in the input list.
    Parameters:
        + urls : [str] = urls to be scraped
    Return:
        A list of items scraped for each url in the original list
    """
    items = []
    scrapers = {}
    for url in urls:
        # Group urls by their corresponding scraper
        scraper = _get_scraper_from_url(url)

        if not (url_list := scrapers.get(scraper)):
            url_list = scrapers[scraper] = []

        url_list.append(url)

    # Bulk scrape urls, one scraper at a time
    for (scraper, url_list) in scrapers.items():
        s = scraper()  # Create a new scraper instance
        items.extend(s.bulk_scrape(url_list))

    return items


def _get_scraper_from_url(url: str) -> Type[BaseScraper]:
    """
    Validate that this url is scrapable and return its
    corresponding scraper when it is.
    """
    if not valid_url(url):
        raise ValueError(f"This is not a valid url: {url}")

    domain = get_domain_from_url(url)

    if not (scraper := URL_TO_SCRAPER.get(domain)):
        raise ValueError(f"Unable to scrape this url: {url}")

    return scraper
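For quick reference, a minimal usage sketch of this interface (the urls below are hypothetical, and the sketch assumes scrape/bulk_scrape are importable from the c4v.scraper package):

from c4v.scraper import scrape, bulk_scrape

# Single url: returns {"url": ..., "data": {...}} for a supported domain
article = scrape("https://elpitazo.net/some-article")

# Several urls at once; output order is not guaranteed to match input order
articles = bulk_scrape([
    "https://elpitazo.net/article-1",
    "https://elpitazo.net/article-2",
])

# Malformed or unsupported urls raise ValueError
try:
    scrape("not-a-url")
except ValueError as error:
    print(error)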
Empty file.
@@ -0,0 +1,60 @@
"""
Base class for a scraper.
In order to create and wire a new scraper:
    1) Create a new scraper in the "scrapers" directory
    2) Make your scraper a subclass of BaseScraper
    3) Implement the missing methods (parse & scrape)
    4) Add an entry for it to the URL_TO_SCRAPER map in settings.py,
       mapping from a domain name to your new scraper. Import it if necessary.
"""

# Python imports
from typing import List, Dict, Any


class BaseScraper:
    """
    Base class for scraper implementations
    """

    def parse(self, response) -> Dict[str, Any]:
        """
        Return scraped data from a response object.
        Parameters:
            + response : any = some kind of structure holding an http response
                               from which we can scrape data
        Return:
            A dict with the fields scraped from the response
        """
        pass

    def scrape(self, url: str) -> Dict[str, Any]:
        """
        Return scraped data from a url.
        Parameters:
            + url : str = url to be scraped by this class
        Return:
            A dict with data scraped from the given url,
            if such url is a valid one
        """
        pass

    def bulk_scrape(self, urls: List[str]) -> List[Dict[str, Any]]:
        """
        Return scraped data for a list of urls. Override this method
        if your scraper implementation can handle an optimized
        bulk scraping.
        Parameters:
            + urls : [str] = urls to be scraped
        Return:
            List of scraped items. Notice that the order is not guaranteed
            to be the same as in the input list.
        """
        items = []
        for url in urls:
            if (item := self.scrape(url)):
                items.append(item)

        return items
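To illustrate the four wiring steps from the docstring above, here is a hedged sketch of a hypothetical subclass; the class name, domain, and returned fields are made up for illustration, and step 4 (registering the domain) is sketched after settings.py further below:

from typing import Dict, Any

from c4v.scraper.scrapers.base_scraper import BaseScraper


class ExampleSiteScraper(BaseScraper):
    """Hypothetical scraper for example-site.com (illustration only)."""

    def scrape(self, url: str) -> Dict[str, Any]:
        # A real implementation would download `url` here and feed the
        # response into self.parse(); this stub only shows the expected shape.
        return {"url": url, "data": self.parse(response=None)}

    def parse(self, response) -> Dict[str, Any]:
        # Extract whatever fields the site exposes from the response
        return {"title": "", "body": ""}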
@@ -0,0 +1,50 @@
"""
Base class for scrapy-based scrapers.
In order to create a new scrapy scraper:
    1) Create a new scraper in the "scrapers" folder and make it a subclass
       of this BaseScrapyScraper
    2) Override the "spider" attribute of your new class with a valid
       scrapy spider
    3) Wire it in settings as you would do with a regular scraper
"""

# External imports
from scrapy import Spider

# Internal imports
from c4v.scraper.scrapers.base_scraper import BaseScraper
from c4v.scraper.spider_manager import SpiderManager

# Python imports
from typing import Type, List, Dict, Any


class BaseScrapyScraper(BaseScraper):
    """
    In order to create a new scrapy scraper, just
    inherit this class and assign a new value to the
    "spider" field, a valid scrapy Spider subclass.
    """

    spider: Type[Spider] = None

    def __init__(self):
        if self.spider is None:
            raise TypeError(
                "Spider not defined, "
                + "perhaps you forgot to override the spider "
                + "attribute in your BaseScrapyScraper subclass?"
            )

        self._spider_manager = SpiderManager(self.spider)

    def parse(self, response) -> Dict[str, Any]:
        return self._spider_manager.parse(response)

    def scrape(self, url: str) -> Dict[str, Any]:
        return self._spider_manager.scrape(url)

    def bulk_scrape(self, urls: List[str]) -> List[Dict[str, Any]]:
        return self._spider_manager.bulk_scrape(urls)
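The ElPitazoScraper in the next file is a concrete instance of these three steps: it subclasses BaseScrapyScraper, points its spider attribute at ElPitazoSpider, and is wired to the elpitazo.net domain in settings.py.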
@@ -0,0 +1,15 @@
"""
Scraper to get data from El Pitazo
"""
# Internal imports
from c4v.scraper.scrapers.base_scrapy_scraper import BaseScrapyScraper
from c4v.scraper.spiders.el_pitazo import ElPitazoSpider


class ElPitazoScraper(BaseScrapyScraper):
    """
    Scrapes data from El Pitazo, relying on
    scrapy for this.
    """

    spider = ElPitazoSpider
@@ -0,0 +1,6 @@
"""
Settings specific to scrapy
"""

# Settings passed to the crawler
CRAWLER_SETTINGS = {"LOG_ENABLED": False}
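Since this dict is handed directly to Scrapy's CrawlerProcess, it can carry any standard Scrapy setting. A hedged sketch of an extended configuration; the keys are standard Scrapy settings, but the values are illustrative and not part of this changeset:

CRAWLER_SETTINGS = {
    "LOG_ENABLED": False,
    "DOWNLOAD_DELAY": 0.5,       # illustrative: throttle requests to the target site
    "CONCURRENT_REQUESTS": 8,    # illustrative: cap parallel requests
    "USER_AGENT": "c4v-scraper (example)",
}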
@@ -0,0 +1,17 @@
"""
This file manages multiple settings shared across the scraper,
such as mappings from urls to scrapers
"""
from c4v.scraper.scrapers.el_pitazo_scraper import ElPitazoScraper
import os


# Dict mapping from domain name to scraper
URL_TO_SCRAPER = {
    "elpitazo.net": ElPitazoScraper,
}


# Root dir, so we can get resources from module directories
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
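The keys of URL_TO_SCRAPER are bare domain names; _get_scraper_from_url in the main module extracts the domain from an incoming url and looks it up here. Registering another outlet would only require a new entry, as in this hypothetical sketch (the scraper below does not exist in this changeset):

# Hypothetical registration step for a new outlet (illustration only)
# from c4v.scraper.scrapers.example_site_scraper import ExampleSiteScraper

URL_TO_SCRAPER = {
    "elpitazo.net": ElPitazoScraper,
    # "example-site.com": ExampleSiteScraper,
}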
@@ -0,0 +1,80 @@
# External imports
import scrapy
from scrapy.crawler import CrawlerProcess
import scrapy.signals

# Project imports
import c4v.scraper.scrapy_settings as settings

# Python imports
from typing import List, Dict, Any


class SpiderManager:
    """
    Utility class to perform common operations on
    Spider classes
    """

    process = CrawlerProcess(settings.CRAWLER_SETTINGS)

    def __init__(self, spider) -> None:
        self.spider = spider

    def parse(self, response) -> Dict[str, Any]:
        """
        Return scraped data from a valid response.
        Parameters:
            + response : scrapy.http.Response = response object holding the actual response
        Return:
            dict-like object with scraped data
        """
        spider = self.spider()
        return spider.parse(response)

    def scrape(self, url: str) -> Dict[str, Any]:
        """
        Return scraped data from a single url.
        Parameters:
            + url : str = url whose data is to be scraped. Should be compatible with the given spider
        Return:
            dict-like object with scraped data
        """
        scraped = self.bulk_scrape([url])

        return scraped[0] if scraped else None

    def bulk_scrape(self, urls: List[str]) -> List[Dict[str, Any]]:
        """
        Return scraped data from a list of valid urls.
        Parameters:
            + urls : [str] = urls whose data is to be scraped.
                             Should be compatible with the provided spider
        Return:
            list of dict-like objects with scraped data
        """
        # If there is nothing to do, just return an empty list
        if not urls:
            return []

        # Items accumulator
        items = []

        # Callback function to collect items on the fly
        def items_scraped(item, response, spider):
            items.append({"url": response.url, "data": item})

        # Set up urls to scrape
        self.spider.start_urls = urls

        # Create a crawler for this spider and connect the signal so we can collect items
        crawler = self.process.create_crawler(self.spider)
        crawler.signals.connect(items_scraped, signal=scrapy.signals.item_scraped)

        # Start scraping
        self.process.crawl(crawler)
        self.process.start()

        # Return the collected items
        return items
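A short usage sketch of SpiderManager with the spider defined in the next file (the url is hypothetical). Note that bulk_scrape drives a full CrawlerProcess run, and Scrapy's reactor cannot be restarted within the same Python process, so this is best treated as a one-shot call per process:

from c4v.scraper.spider_manager import SpiderManager
from c4v.scraper.spiders.el_pitazo import ElPitazoSpider

manager = SpiderManager(ElPitazoSpider)
items = manager.bulk_scrape(["https://elpitazo.net/some-article"])  # hypothetical url

for item in items:
    print(item["url"], list(item["data"].keys()))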
Empty file.
@@ -0,0 +1,71 @@
# Internal imports
import c4v.scraper.utils as utils

# External imports
import scrapy

# Python imports
from typing import List, Dict, Any


class ElPitazoSpider(scrapy.Spider):
    """
    Spider to scrape ElPitazo data
    """

    name = "el_pitazo"

    start_urls = []

    def parse(self, response) -> Dict[str, Any]:
        """
        Returns a dict-like structure with the following
        fields:
            + title
            + date
            + categories
            + body
            + author
            + tags
        """
        # These are simple properties; just get their text with a valid
        # selector
        title = response.css(".tdb-title-text ::text").get() or ""
        date = response.css(".entry-date ::text").get() or ""
        author = response.css(".tdb-author-name ::text").get() or ""

        body = self._get_body(response)

        tags = self._get_tags(response)

        # Categories
        categories = response.css(".tdb-entry-category ::text").getall()

        return {
            "title": title,
            "date": date,
            "categories": categories,
            "body": body,
            "author": author,
            "tags": tags,
        }

    def _get_body(self, response) -> str:
        """
        Get article body as a single string
        """
        body = response.css("#bsf_rt_marker > p").getall()
        body = filter(lambda p: p.startswith("<p>") and p.endswith("</p>"), body)
        body = map(utils.strip_http_tags, body)

        body = "\n".join(body)

        return body.strip()

    def _get_tags(self, response) -> List[str]:
        """
        Try to get tags from the document if available
        """
        tags = response.css(".tdb-tags > li > a ::text").getall()
        return tags
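_get_body relies on utils.strip_http_tags, defined in c4v/scraper/utils.py, which is not included in the excerpt above. A minimal sketch of what such a helper could look like, assuming w3lib (already a Scrapy dependency) is acceptable; this is an assumption, not the project's actual implementation:

from w3lib.html import remove_tags


def strip_http_tags(element: str) -> str:
    # Assumed shape only: drop markup such as "<p>...</p>" and keep the readable text
    return remove_tags(element).strip()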