Add scraper for FECAMRN's website (#156)
1 parent a602b27 · commit 2623eda

Showing 4 changed files with 319 additions and 0 deletions.
@@ -0,0 +1,8 @@
The
[Federation of City Councils of the State of Rio Grande do Norte](https://fecamrn.com.br/)
hosts some of the transparency portals of city councils in the state and
links to the others.

All of them are
[on this list](https://fecamrn.com.br/transparencias-das-camaras-municipais).
The data source harvester scrapes the links off that page.
@@ -0,0 +1,20 @@
# FECAMRN import scripts

These scripts import data from the FECAMRN data source. For more
information on the rationale and process, see the
[source description](../../../sources/fecammrn/README.md).

## Usage

1. Create a Python virtual environment. This is recommended, but not
   required; see the example after this list.
2. Install the dependencies:
   ```
   pip install -r requirements.txt
   ```
3. Run the script:
   ```
   python tools/harvest/fecamrn/fecammrn.py
   ```
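
For step 1, a typical way to create and activate a virtual environment is
(a sketch; the `.venv` directory name is an arbitrary choice):

```
python3 -m venv .venv
source .venv/bin/activate
```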

Note: Python 3 is required for this script.
@@ -0,0 +1,79 @@
import os
import argparse

from bs4 import BeautifulSoup

from harvest.scrapers import TransparencyPortalScraper


class FECAMRNScraper(TransparencyPortalScraper):
    source_url: str = "https://fecamrn.com.br/transparencias-das-camaras-municipais"

    def parse(self):
        """Parse the webpage listing the city councils."""
        soup = BeautifulSoup(self.web_content, "html.parser")
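        # Assumed page structure (as of this commit): each municipality name
        # sits in a <p> inside <div class="texto">, and its portal link is in
        # the <a> element that follows it.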
        links = [
            (p.text.title(), p.find_next("a")["href"])
            for p in soup.find("div", {"class": "texto"}).find_all("p")
            if p.text
        ]
        for name, url in links:
            self.append(
                state_code="RN",
                municipality=name,
                sphere="municipal",
                branch="legislative",
                url=url,
                type="SPT",
            )


def extract_fecamrn_portals(**kwargs):
    """Extract city council transparency portals by scraping FECAMRN's
    website.

    Args:
        output_folder (str): Path to write the output to.
        source_url (str): URL to FECAMRN's website.
    """
    scraper = FECAMRNScraper(**kwargs)
    scraper.harvest()
    scraper.resource.data = scraper.fill_municipal_codes(scraper.dataframe)
    scraper.save()


def parse_cli() -> dict:
    """Parse the command line arguments.

    Returns:
        dict: A dict containing the values for output_folder and source_url.
    """
    parser = argparse.ArgumentParser(
        description="Scrapes candidate URLs for council portals "
        "from FECAMRN's website."
    )
    parser.add_argument(
        "output",
        help="path to write the extracted csv to",
        default="",
        nargs="?",
    )
    parser.add_argument(
        "url",
        help="URL for the FECAMRN website",
        default="",
        nargs="?",
    )
    params = {}
    args = parser.parse_args()
    if args.output:
        params["output_folder"] = args.output
    if args.output and not os.path.exists(args.output):
        raise FileNotFoundError(f"Folder not found: {args.output}")
    params["source_url"] = args.url if args.url else FECAMRNScraper.source_url
    return params


if __name__ == "__main__":
    options = parse_cli()
    extract_fecamrn_portals(**options)
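
As a usage sketch, the extractor can also be called from Python instead of
the CLI (the import path is an assumption and depends on how `tools/harvest`
ends up on `PYTHONPATH`; `data/unverified` is the harvester's default output
folder):

```
# Hypothetical import path for the module added in this commit.
from harvest.fecamrn.fecammrn import extract_fecamrn_portals

# Harvests the portal links, fills IBGE municipality codes, and writes
# municipality-transparency-portals-candidate-links.csv to the folder.
extract_fecamrn_portals(output_folder="data/unverified")
```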
@@ -0,0 +1,212 @@
from abc import ABC, abstractmethod
import os
import unicodedata

import requests
import pandas as pd
from frictionless import Package, Resource, Schema

from settings import USER_AGENT, DEFAULT_TIMEOUT as TIMEOUT


def remove_accents(text: str) -> str:
    """Remove accents from text.

    Args:
        text (str): The text to remove accents from.

    Returns:
        str: The text without accents.
    """
    return "".join(
        char
        for char in unicodedata.normalize("NFD", text)
        if not unicodedata.combining(char)
    )
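

# For example (standard NFD behavior, illustrated with an RN municipality):
#   remove_accents("São Gonçalo do Amarante") == "Sao Goncalo do Amarante"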


class Harvester(ABC):
    """Base class for harvesting data sources."""

    output_folder: str
    output_file: str
    schema: Schema = None
    title: str = None
    description: str = None

    def __init__(
        self,
        title: str = None,
        description: str = None,
        schema: Schema = None,
        output_folder: str = "data/unverified",
    ):
        if title:
            self.title = title
        if description:
            self.description = description
        if schema:
            self.schema = schema
        if self.schema:
            self.resource = Resource(
                pd.DataFrame(columns=[field.name for field in self.schema.fields]),
                schema=self.schema,
                title=self.title,
                description=self.description,
            )
        else:
            self.resource = Resource(
                pd.DataFrame(),
                title=self.title,
                description=self.description,
            )
        self.output_folder = output_folder

    @property
    def dataframe(self) -> pd.DataFrame:
        """Shortcut to the resource data frame, containing the data
        harvested so far.

        Returns:
            pd.DataFrame: Data harvested.
        """
        return self.resource.data

    def append(self, **kwargs):
        """Append a row to the data frame."""
        self.dataframe.loc[len(self.dataframe)] = kwargs

    @abstractmethod
    def harvest(self):
        """Harvest the data."""

    @property
    @abstractmethod
    def reference_data(self):
        """Handle for the data frame of reference data that is going
        to be updated."""
        return None

    @property
    def municipality(self):
        """Returns the auxiliary data resource for municipalities."""
        current_dir = os.path.dirname(os.path.abspath(__file__))
        aux_data_dir = os.path.join(
            current_dir, "..", "..", "data", "auxiliary", "geographic"
        )
        geographic = Package(os.path.join(aux_data_dir, "datapackage.json"))
        return geographic.get_resource("municipality")

    def fill_municipal_codes(self, frame: pd.DataFrame) -> pd.DataFrame:
        """Fill municipal codes in harvested data based on state code and
        normalized municipality name.

        Args:
            frame (pd.DataFrame): Data frame without municipality codes.

        Returns:
            pd.DataFrame: Data frame with appropriate municipality codes.
        """
        frame["normalized_name"] = (
            frame["municipality"].str.lower().apply(remove_accents)
        )
        frame = frame.drop(["municipality", "municipality_code"], axis=1)
        codes = self.municipality.to_pandas().loc[:, ["uf", "name", "code"]]
        codes["normalized_name"] = codes["name"].str.lower().apply(remove_accents)
        codes = codes.rename(
            columns={
                "uf": "state_code",
                "name": "municipality",
                "code": "municipality_code",
            }
        )
        merged = frame.merge(codes, on=["state_code", "normalized_name"])
        merged = merged.reindex(
            columns=[field.name for field in self.resource.schema.fields]
        )
        return merged
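
    # fill_municipal_codes example: a harvested row ("RN", "São Gonçalo Do
    # Amarante") normalizes to "sao goncalo do amarante", matches the IBGE
    # reference row for that municipality, and the merge fills in its code.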

    def save(self):
        """Saves the file with candidate links."""
        self.resource.data.to_csv(
            os.path.join(self.output_folder, self.output_file), index=False
        )


class DataScraper(Harvester, ABC):
    """Harvester for scraping data off websites."""

    source_url: str
    web_content: str

    def __init__(self, *args, source_url: str = None, **kwargs):
        self.web_content = None
        if source_url:
            self.source_url = source_url
        super().__init__(*args, **kwargs)

    def fetch(self, url: str):
        """Fetches the website content from source and keeps it in the
        DataScraper object.
        """
        response = requests.get(
            url, headers={"user-agent": USER_AGENT}, timeout=TIMEOUT
        )
        response.raise_for_status()
        self.web_content = response.content

    @abstractmethod
    def parse(self):
        """Parse the page content and store it in the data frame."""

    def harvest(self):
        """Scrape the data by fetching and parsing the content of the
        web page."""
        self.fetch(self.source_url)
        self.parse()


class WebsiteLinkScraper(DataScraper):
    """Harvester for scraping institutional website links."""

    output_file: str = "municipality-website-candidate-links.csv"

    def __init__(self, *args, **kwargs):
        self.schema = self.municipality_website.schema
        super().__init__(*args, schema=self.schema, **kwargs)

    @property
    def municipality_website(self):
        """Returns the valid data resource for institutional websites."""
        current_dir = os.path.dirname(os.path.abspath(__file__))
        valid_data_dir = os.path.join(current_dir, "..", "..", "data", "valid")
        valid = Package(os.path.join(valid_data_dir, "datapackage.json"))
        return valid.get_resource("brazilian-municipality-and-state-websites")

    @property
    def reference_data(self):
        """Reference data is the municipality websites resource."""
        return self.municipality_website


class TransparencyPortalScraper(DataScraper):
    """Harvester for scraping transparency portal links."""

    output_file: str = "municipality-transparency-portals-candidate-links.csv"

    def __init__(self, *args, **kwargs):
        self.schema = self.transparency_portal.schema
        super().__init__(*args, schema=self.schema, **kwargs)

    @property
    def transparency_portal(self):
        """Returns the valid data resource for transparency portals."""
        current_dir = os.path.dirname(os.path.abspath(__file__))
        valid_data_dir = os.path.join(current_dir, "..", "..", "data", "valid")
        valid = Package(os.path.join(valid_data_dir, "datapackage.json"))
        return valid.get_resource("brazilian-transparency-and-open-data-portals")

    @property
    def reference_data(self):
        """Reference data is the municipality transparency portals resource."""
        return self.transparency_portal
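
For reference, a minimal sketch of how a new scraper plugs into this
hierarchy (the class name, URL, and selector below are hypothetical, not
part of this commit): subclass one of the concrete scrapers, point
`source_url` at a page, and implement `parse()`, mirroring what the FECAMRN
scraper does.

```
from bs4 import BeautifulSoup


class ExampleCouncilScraper(TransparencyPortalScraper):
    # Hypothetical page listing council transparency portals.
    source_url: str = "https://example.org/council-portals"

    def parse(self):
        """Collect every link on the page as a candidate portal URL."""
        soup = BeautifulSoup(self.web_content, "html.parser")
        for anchor in soup.find_all("a", href=True):
            self.append(
                state_code="RN",
                municipality=anchor.text.title(),
                sphere="municipal",
                branch="legislative",
                url=anchor["href"],
                type="SPT",
            )
```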