Add scraper for FECAMRN's website (#156)
1 parent a602b27 · commit 2623eda

Showing 4 changed files with 319 additions and 0 deletions.
@@ -0,0 +1,8 @@
The
[Federation of City Councils of the State of Rio Grande do Norte](https://fecamrn.com.br/)
hosts some of the transparency portals of city councils in the state and
links to the others.

All of them are
[on this list](https://fecamrn.com.br/transparencias-das-camaras-municipais).
The data source harvester scrapes the links off that page.
@@ -0,0 +1,20 @@
# FECAMRN import scripts

These scripts import data from the FECAMRN data source. For more
information on the rationale and process, see the
[source description](../../../sources/fecammrn/README.md).

## Usage

1. Create a Python virtual environment. This is recommended, but not
   required; see the example after this list.
2. Install the dependencies:
   ```
   pip install -r requirements.txt
   ```
3. Run the script:
   ```
   python tools/harvest/fecamrn/fecammrn.py
   ```
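
For step 1, a typical way to create and activate a virtual environment is
(a sketch; the `.venv` directory name is an arbitrary choice):

```
python3 -m venv .venv
source .venv/bin/activate
```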

Note: Python 3 is required for this script.
@@ -0,0 +1,79 @@
import os
import argparse

from bs4 import BeautifulSoup

from harvest.scrapers import TransparencyPortalScraper


class FECAMRNScraper(TransparencyPortalScraper):
    source_url: str = "https://fecamrn.com.br/transparencias-das-camaras-municipais"

    def parse(self):
        """Parse the webpage listing the city councils."""
        soup = BeautifulSoup(self.web_content, "html.parser")
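        # Assumed page structure (as of this commit): each municipality name
        # sits in a <p> inside <div class="texto">, and its portal link is in
        # the <a> element that follows it.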
        links = [
            (p.text.title(), p.find_next("a")["href"])
            for p in soup.find("div", {"class": "texto"}).find_all("p")
            if p.text
        ]
        for name, url in links:
            self.append(
                state_code="RN",
                municipality=name,
                sphere="municipal",
                branch="legislative",
                url=url,
                type="SPT",
            )


def extract_fecamrn_portals(**kwargs):
    """Extract city council transparency portals by scraping FECAMRN's
    website.

    Args:
        output_folder (str): Path to write the output to.
        source_url (str): URL to FECAMRN's website.
    """
    scraper = FECAMRNScraper(**kwargs)
    scraper.harvest()
    scraper.resource.data = scraper.fill_municipal_codes(scraper.dataframe)
    scraper.save()


def parse_cli() -> dict:
    """Parse the command line arguments.

    Returns:
        dict: A dict containing the values for output_folder and source_url.
    """
    parser = argparse.ArgumentParser(
        description="Scrapes candidate URLs for council portals "
        "from FECAMRN's website."
    )
    parser.add_argument(
        "output",
        help="path to write the extracted csv to",
        default="",
        nargs="?",
    )
    parser.add_argument(
        "url",
        help="URL for the FECAMRN website",
        default="",
        nargs="?",
    )
    params = {}
    args = parser.parse_args()
    if args.output:
        params["output_folder"] = args.output
    if args.output and not os.path.exists(args.output):
        raise FileNotFoundError(f"Folder not found: {args.output}")
    params["source_url"] = args.url if args.url else FECAMRNScraper.source_url
    return params


if __name__ == "__main__":
    options = parse_cli()
    extract_fecamrn_portals(**options)
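
As a usage sketch, the extractor can also be called from Python instead of
the CLI (the import path is an assumption and depends on how `tools/harvest`
ends up on `PYTHONPATH`; `data/unverified` is the harvester's default output
folder):

```
# Hypothetical import path for the module added in this commit.
from harvest.fecamrn.fecammrn import extract_fecamrn_portals

# Harvests the portal links, fills IBGE municipality codes, and writes
# municipality-transparency-portals-candidate-links.csv to the folder.
extract_fecamrn_portals(output_folder="data/unverified")
```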
@@ -0,0 +1,212 @@
from abc import ABC, abstractmethod
import os
import unicodedata

import requests
import pandas as pd
from frictionless import Package, Resource, Schema

from settings import USER_AGENT, DEFAULT_TIMEOUT as TIMEOUT


def remove_accents(text: str) -> str:
    """Remove accents from text.

    Args:
        text (str): The text to remove accents from.

    Returns:
        str: The text without accents.
    """
    return "".join(
        char
        for char in unicodedata.normalize("NFD", text)
        if not unicodedata.combining(char)
    )
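

# For example (standard NFD behavior, illustrated with an RN municipality):
#   remove_accents("São Gonçalo do Amarante") == "Sao Goncalo do Amarante"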


class Harvester(ABC):
    """Base class for harvesting data sources."""

    output_folder: str
    output_file: str
    schema: Schema = None
    title: str = None
    description: str = None

    def __init__(
        self,
        title: str = None,
        description: str = None,
        schema: Schema = None,
        output_folder: str = "data/unverified",
    ):
        if title:
            self.title = title
        if description:
            self.description = description
        if schema:
            self.schema = schema
        if self.schema:
            self.resource = Resource(
                pd.DataFrame(columns=[field.name for field in self.schema.fields]),
                schema=self.schema,
                title=self.title,
                description=self.description,
            )
        else:
            self.resource = Resource(
                pd.DataFrame(),
                title=self.title,
                description=self.description,
            )
        self.output_folder = output_folder

    @property
    def dataframe(self) -> pd.DataFrame:
        """Shortcut to the resource data frame, containing the data
        harvested so far.

        Returns:
            pd.DataFrame: Data harvested.
        """
        return self.resource.data

    def append(self, **kwargs):
        """Append a row to the data frame."""
        self.dataframe.loc[len(self.dataframe)] = kwargs

    @abstractmethod
    def harvest(self):
        """Harvest the data."""

    @property
    @abstractmethod
    def reference_data(self):
        """Handle for the data frame of reference data that is going
        to be updated."""
        return None

    @property
    def municipality(self):
        """Returns the auxiliary data resource for municipalities."""
        current_dir = os.path.dirname(os.path.abspath(__file__))
        aux_data_dir = os.path.join(
            current_dir, "..", "..", "data", "auxiliary", "geographic"
        )
        geographic = Package(os.path.join(aux_data_dir, "datapackage.json"))
        return geographic.get_resource("municipality")

    def fill_municipal_codes(self, frame: pd.DataFrame) -> pd.DataFrame:
        """Fill municipal codes in harvested data based on state code and
        normalized municipality name.

        Args:
            frame (pd.DataFrame): Data frame without municipality codes.

        Returns:
            pd.DataFrame: Data frame with appropriate municipality codes.
        """
        frame["normalized_name"] = (
            frame["municipality"].str.lower().apply(remove_accents)
        )
        frame = frame.drop(["municipality", "municipality_code"], axis=1)
        codes = self.municipality.to_pandas().loc[:, ["uf", "name", "code"]]
        codes["normalized_name"] = codes["name"].str.lower().apply(remove_accents)
        codes = codes.rename(
            columns={
                "uf": "state_code",
                "name": "municipality",
                "code": "municipality_code",
            }
        )
        merged = frame.merge(codes, on=["state_code", "normalized_name"])
        merged = merged.reindex(
            columns=[field.name for field in self.resource.schema.fields]
        )
        return merged
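
    # fill_municipal_codes example: a harvested row ("RN", "São Gonçalo Do
    # Amarante") normalizes to "sao goncalo do amarante", matches the IBGE
    # reference row for that municipality, and the merge fills in its code.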

    def save(self):
        """Saves the file with candidate links."""
        self.resource.data.to_csv(
            os.path.join(self.output_folder, self.output_file), index=False
        )


class DataScraper(Harvester, ABC):
    """Harvester for scraping data off websites."""

    source_url: str
    web_content: str

    def __init__(self, *args, source_url: str = None, **kwargs):
        self.web_content = None
        if source_url:
            self.source_url = source_url
        super().__init__(*args, **kwargs)

    def fetch(self, url: str):
        """Fetches the website content from source and keeps it in the
        DataScraper object.
        """
        response = requests.get(
            url, headers={"user-agent": USER_AGENT}, timeout=TIMEOUT
        )
        response.raise_for_status()
        self.web_content = response.content

    @abstractmethod
    def parse(self):
        """Parse the page content and store it in the data frame."""

    def harvest(self):
        """Scrape the data by fetching and parsing the content of the
        web page."""
        self.fetch(self.source_url)
        self.parse()


class WebsiteLinkScraper(DataScraper):
    """Harvester for scraping institutional website links."""

    output_file: str = "municipality-website-candidate-links.csv"

    def __init__(self, *args, **kwargs):
        self.schema = self.municipality_website.schema
        super().__init__(*args, schema=self.schema, **kwargs)

    @property
    def municipality_website(self):
        """Returns the valid data resource for institutional websites."""
        current_dir = os.path.dirname(os.path.abspath(__file__))
        valid_data_dir = os.path.join(current_dir, "..", "..", "data", "valid")
        valid = Package(os.path.join(valid_data_dir, "datapackage.json"))
        return valid.get_resource("brazilian-municipality-and-state-websites")

    @property
    def reference_data(self):
        """Reference data is the municipality websites resource."""
        return self.municipality_website


class TransparencyPortalScraper(DataScraper):
    """Harvester for scraping transparency portal links."""

    output_file: str = "municipality-transparency-portals-candidate-links.csv"

    def __init__(self, *args, **kwargs):
        self.schema = self.transparency_portal.schema
        super().__init__(*args, schema=self.schema, **kwargs)

    @property
    def transparency_portal(self):
        """Returns the valid data resource for transparency portals."""
        current_dir = os.path.dirname(os.path.abspath(__file__))
        valid_data_dir = os.path.join(current_dir, "..", "..", "data", "valid")
        valid = Package(os.path.join(valid_data_dir, "datapackage.json"))
        return valid.get_resource("brazilian-transparency-and-open-data-portals")

    @property
    def reference_data(self):
        """Reference data is the municipality transparency portals resource."""
        return self.transparency_portal
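
For reference, a minimal sketch of how a new scraper plugs into this
hierarchy (the class name, URL, and selector below are hypothetical, not
part of this commit): subclass one of the concrete scrapers, point
`source_url` at a page, and implement `parse()`, mirroring what the FECAMRN
scraper does.

```
from bs4 import BeautifulSoup


class ExampleCouncilScraper(TransparencyPortalScraper):
    # Hypothetical page listing council transparency portals.
    source_url: str = "https://example.org/council-portals"

    def parse(self):
        """Collect every link on the page as a candidate portal URL."""
        soup = BeautifulSoup(self.web_content, "html.parser")
        for anchor in soup.find_all("a", href=True):
            self.append(
                state_code="RN",
                municipality=anchor.text.title(),
                sphere="municipal",
                branch="legislative",
                url=anchor["href"],
                type="SPT",
            )
```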