Add scraper for FECAMRN's website (#156)
augusto-herrmann committed Mar 6, 2023
1 parent a602b27 commit 2623eda
Showing 4 changed files with 319 additions and 0 deletions.
8 changes: 8 additions & 0 deletions sources/fecamrn/README.md
@@ -0,0 +1,8 @@
The
[Federation of City Councils of the State of Rio Grande do Norte](https://fecamrn.com.br/)
hosts some of the transparency portals of city councils in the state
and links to the others.

All of them are
[on this list](https://fecamrn.com.br/transparencias-das-camaras-municipais).
The data source harvester scrapes the links off that page.
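
For reference, here is a minimal standalone sketch of that scraping
step. It assumes the `requests` and `beautifulsoup4` packages; the
harvester in this repository wraps the same logic in a scraper class.

```
import requests
from bs4 import BeautifulSoup

URL = "https://fecamrn.com.br/transparencias-das-camaras-municipais"

# Fetch the page listing the city council transparency portals.
response = requests.get(URL, timeout=30)
response.raise_for_status()

# Each council is a <p> element inside the div with class "texto";
# the portal URL is in the first <a> tag that follows it.
soup = BeautifulSoup(response.content, "html.parser")
for paragraph in soup.find("div", {"class": "texto"}).find_all("p"):
    if paragraph.text:
        print(paragraph.text.title(), paragraph.find_next("a")["href"])
```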
20 changes: 20 additions & 0 deletions tools/harvest/fecamrn/README.md
@@ -0,0 +1,20 @@
# FECAMRN import scripts

These scripts import data from the FECAMRN data source. For more
information on the rationale and process, see the
[source description](../../../sources/fecamrn/README.md).

## Usage

1. Create a Python virtual environment. This is not required, but it is
recommended (see the example after these steps).
2. Install the dependencies:
```
pip install -r requirements.txt
```
3. Run the script:
```
python tools/harvest/fecamrn/fecamrn.py
```

Note: Python 3 is required for this script.
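
For step 1, a typical virtual environment setup looks like this (the
`.venv` directory name is just a convention):

```
python3 -m venv .venv
source .venv/bin/activate
```

On Windows, activate with `.venv\Scripts\activate` instead.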
79 changes: 79 additions & 0 deletions tools/harvest/fecamrn/fecamrn.py
@@ -0,0 +1,79 @@
import os
import argparse

from bs4 import BeautifulSoup

from harvest.scrapers import TransparencyPortalScraper


class FECAMRNScraper(TransparencyPortalScraper):
source_url: str = "https://fecamrn.com.br/transparencias-das-camaras-municipais"

def parse(self):
"""Parse list of city councils webpage."""
soup = BeautifulSoup(self.web_content, "html.parser")
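# Each council is listed in a <p> element inside the div with
# class "texto"; the portal URL is in the first <a> tag that
# follows each paragraph.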
links = [
(p.text.title(), p.find_next("a")["href"])
for p in soup.find("div", {"class": "texto"}).find_all("p")
if p.text
]
for name, url in links:
self.append(
state_code="RN",
municipality=name,
sphere="municipal",
branch="legislative",
url=url,
type="SPT",
)


def extract_fecamrn_portals(**kwargs):
"""Extract city council transparency portals by scraping FECAMRN's
website.
Args:
output_folder (str): Path to write the output to.
source_url (str): URL to FECAMRN's website.
"""
scraper = FECAMRNScraper(**kwargs)
scraper.harvest()
scraper.resource.data = scraper.fill_municipal_codes(scraper.dataframe)
scraper.save()


def parse_cli() -> dict:
"""Parses the command line interface.
Returns:
dict: A dict containing the values for output_folder and source_url.
"""
parser = argparse.ArgumentParser(
description="""Scrapes candidate URLs for council portals """
"""from FECAMRN's website."""
)
parser.add_argument(
"output",
help=("path to write the extracted csv to"),
default="",
nargs="?",
)
parser.add_argument(
"url",
help=("URL for the FECAMRN website"),
default="",
nargs="?",
)
params = {}
args = parser.parse_args()
if args.output:
params["output_folder"] = args.output
if args.output and not os.path.exists(args.output):
raise FileNotFoundError(f"Folder not found: {args.output}")
params["source_url"] = args.url if args.url else FECAMRNScraper.source_url
return params


if __name__ == "__main__":
options = parse_cli()
extract_fecamrn_portals(**options)
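
Both command line arguments are optional. With no arguments, the script
uses the default source URL and writes
`municipality-transparency-portals-candidate-links.csv` to the default
`data/unverified` output folder, so a typical invocation (assuming that
folder exists) is simply:

```
python tools/harvest/fecamrn/fecamrn.py data/unverified
```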
212 changes: 212 additions & 0 deletions tools/harvest/scrapers.py
@@ -0,0 +1,212 @@
from abc import ABC, abstractmethod
import os
import unicodedata

import requests
import pandas as pd
from frictionless import Package, Resource, Schema

from settings import USER_AGENT, DEFAULT_TIMEOUT as TIMEOUT


def remove_accents(text: str) -> str:
"""Remove accents from text.
Args:
text (str): The text to remove accents from.
Returns:
str: The text without accents.
"""
return "".join(
char
for char in unicodedata.normalize("NFD", text)
if not unicodedata.combining(char)
)


class Harvester(ABC):
"""Base class for harvesting data sources."""

output_folder: str
output_file: str
schema: Schema
title: str = None
description: str = None

def __init__(
self,
title: str = None,
description: str = None,
schema: Schema = None,
output_folder: str = "data/unverified",
):
if title:
self.title = title
if description:
self.description = description
if schema:
self.schema = schema
if self.schema:
self.resource = Resource(
pd.DataFrame(columns=[field.name for field in self.schema.fields]),
schema=self.schema,
title=self.title,
description=self.description,
)
else:
self.resource = Resource(
pd.DataFrame(),
title=self.title,
description=self.description,
)
self.output_folder = output_folder

@property
def dataframe(self) -> pd.DataFrame:
"""Shortcut to the resource data frame, containing the data
harvested so far.
Returns:
pd.DataFrame: Data harvested
"""
return self.resource.data

def append(self, **kwargs):
"""Append a row to the data frame."""
self.dataframe.loc[len(self.dataframe)] = kwargs

@abstractmethod
def harvest(self):
"""Harvest the data."""

@property
@abstractmethod
def reference_data(self):
"""Handle for the data frame of reference data that is going
to be updated."""
return None

@property
def municipality(self):
"""Returns the auxiliary data resource for municipalities."""
current_dir = os.path.dirname(os.path.abspath(__file__))
aux_data_dir = os.path.join(
current_dir, "..", "..", "data", "auxiliary", "geographic"
)
geographic = Package(os.path.join(aux_data_dir, "datapackage.json"))
return geographic.get_resource("municipality")

def fill_municipal_codes(self, frame: pd.DataFrame) -> pd.DataFrame:
"""Fill municipal codes in harvested data based on state code and
normalized municipality name.
Args:
frame (pd.DataFrame): Data frame without municipality codes.
Returns:
pd.DataFrame: Data frame with appropriate municipality codes.
"""
frame["normalized_name"] = (
frame["municipality"].str.lower().apply(remove_accents)
)
frame = frame.drop(["municipality", "municipality_code"], axis=1)
codes = self.municipality.to_pandas().loc[:, ["uf", "name", "code"]]
codes["normalized_name"] = codes["name"].str.lower().apply(remove_accents)
codes = codes.rename(
columns={
"uf": "state_code",
"name": "municipality",
"code": "municipality_code",
}
)
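# merge defaults to an inner join, so harvested rows whose
# normalized name has no match in the reference data are dropped.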
merged = frame.merge(codes, on=["state_code", "normalized_name"])
merged = merged.reindex(
columns=[field.name for field in self.resource.schema.fields]
)
return merged

def save(self):
"""Saves the file with candidate links."""
self.resource.data.to_csv(
os.path.join(self.output_folder, self.output_file), index=False
)


class DataScraper(Harvester, ABC):
"""Harvester for scraping data off websites."""

source_url: str
web_content: str

def __init__(self, *args, source_url: str = None, **kwargs):
self.web_content = None
if source_url:
self.source_url = source_url
super().__init__(*args, **kwargs)

def fetch(self, url: str):
"""Fetches the website content from source and keeps it in the
DataScraper object.
"""
response = requests.get(
url, headers={"user-agent": USER_AGENT}, timeout=TIMEOUT
)
response.raise_for_status()
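# Keep the raw response bytes; BeautifulSoup can detect the
# encoding itself when parsing.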
self.web_content = response.content

@abstractmethod
def parse(self):
"""Parse the page content and store it in the data frame."""

def harvest(self):
"""Scrape the data by fetching and parsing the content of the
web page."""
self.fetch(self.source_url)
self.parse()


class WebsiteLinkScraper(DataScraper):
"""Harvester for scraping institutional website links."""

output_file: str = "municipality-website-candidate-links.csv"

def __init__(self, *args, **kwargs):
self.schema = self.municipality_website.schema
super().__init__(*args, schema=self.schema, **kwargs)

@property
def municipality_website(self):
"""Returns the valid data resource for institutional websites."""
current_dir = os.path.dirname(os.path.abspath(__file__))
valid_data_dir = os.path.join(current_dir, "..", "..", "data", "valid")
valid = Package(os.path.join(valid_data_dir, "datapackage.json"))
return valid.get_resource("brazilian-municipality-and-state-websites")

@property
def reference_data(self):
"""Reference data is the municipality websites resource."""
return self.municipality_website


class TransparencyPortalScraper(DataScraper):
"""Harvester for scraping transparency portal links."""

output_file: str = "municipality-transparency-portals-candidate-links.csv"

def __init__(self, *args, **kwargs):
self.schema = self.transparency_portal.schema
super().__init__(*args, schema=self.schema, **kwargs)

@property
def transparency_portal(self):
"""Returns the valid data resource for transparency portals."""
current_dir = os.path.dirname(os.path.abspath(__file__))
valid_data_dir = os.path.join(current_dir, "..", "..", "data", "valid")
valid = Package(os.path.join(valid_data_dir, "datapackage.json"))
return valid.get_resource("brazilian-transparency-and-open-data-portals")

@property
def reference_data(self):
"""Reference data is the municipality transparency portals resource."""
return self.transparency_portal
