diff --git a/sources/fecamrn/README.md b/sources/fecamrn/README.md
new file mode 100644
index 0000000..e9209db
--- /dev/null
+++ b/sources/fecamrn/README.md
@@ -0,0 +1,8 @@
+The
+[Federation of City Councils of the State of Rio Grande do Norte](https://fecamrn.com.br/)
+hosts some transparency portals of city councils in the state and has
+links to the others.
+
+All of them are
+[on this list](https://fecamrn.com.br/transparencias-das-camaras-municipais).
+The data source harvester scrapes the links off that page.
diff --git a/tools/harvest/fecamrn/README.md b/tools/harvest/fecamrn/README.md
new file mode 100644
index 0000000..f6e5094
--- /dev/null
+++ b/tools/harvest/fecamrn/README.md
@@ -0,0 +1,20 @@
+# FECAMRN import scripts
+
+These scripts import data from the FECAMRN data source. For more
+information on the rationale and process, see the
+[source description here](../../../sources/fecamrn/README.md).
+
+## Usage
+
+1. Create a Python virtual environment. This is not required, but it is
+   recommended.
+2. Install the dependencies:
+   ```
+   pip install -r requirements.txt
+   ```
+3. Run the script:
+   ```
+   python tools/harvest/fecamrn/fecamrn.py
+   ```
+
+Note: Python 3 is required for these scripts.
diff --git a/tools/harvest/fecamrn/fecamrn.py b/tools/harvest/fecamrn/fecamrn.py
new file mode 100644
index 0000000..a0e0611
--- /dev/null
+++ b/tools/harvest/fecamrn/fecamrn.py
@@ -0,0 +1,79 @@
+import os
+import argparse
+
+from bs4 import BeautifulSoup
+
+from harvest.scrapers import TransparencyPortalScraper
+
+
+class FECAMRNScraper(TransparencyPortalScraper):
+    source_url: str = "https://fecamrn.com.br/transparencias-das-camaras-municipais"
+
+    def parse(self):
+        """Parse the list of city councils webpage."""
+        soup = BeautifulSoup(self.web_content, "html.parser")
+        links = [
+            (p.text.title(), p.find_next("a")["href"])
+            for p in soup.find("div", {"class": "texto"}).find_all("p")
+            if p.text
+        ]
+        for name, url in links:
+            self.append(
+                state_code="RN",
+                municipality=name,
+                sphere="municipal",
+                branch="legislative",
+                url=url,
+                type="SPT",
+            )
+
+
+def extract_fecamrn_portals(**kwargs):
+    """Extract city council transparency portals by scraping FECAMRN's
+    website.
+
+    Args:
+        output_folder (str): Path to write the output to.
+        source_url (str): URL to FECAMRN's website.
+    """
+    scraper = FECAMRNScraper(**kwargs)
+    scraper.harvest()
+    scraper.resource.data = scraper.fill_municipal_codes(scraper.dataframe)
+    scraper.save()
+
+
+def parse_cli() -> dict:
+    """Parses the command line interface.
+
+    Returns:
+        dict: A dict containing the values for output_folder and source_url.
+    """
+    parser = argparse.ArgumentParser(
+        description="""Scrapes candidate URLs for council portals """
+        """from FECAMRN's website."""
+    )
+    parser.add_argument(
+        "output",
+        help=("path to write the extracted csv to"),
+        default="",
+        nargs="?",
+    )
+    parser.add_argument(
+        "url",
+        help=("URL for the FECAMRN website"),
+        default="",
+        nargs="?",
+    )
+    params = {}
+    args = parser.parse_args()
+    if args.output:
+        params["output_folder"] = args.output
+    if args.output and not os.path.exists(args.output):
+        raise FileNotFoundError(f"Folder not found: {args.output}")
+    params["source_url"] = args.url if args.url else FECAMRNScraper.source_url
+    return params
+
+
+if __name__ == "__main__":
+    options = parse_cli()
+    extract_fecamrn_portals(**options)
diff --git a/tools/harvest/scrapers.py b/tools/harvest/scrapers.py
new file mode 100644
index 0000000..1f95220
--- /dev/null
+++ b/tools/harvest/scrapers.py
@@ -0,0 +1,212 @@
+from abc import ABC, abstractmethod
+import os
+import unicodedata
+
+import requests
+import pandas as pd
+from frictionless import Package, Resource, Schema
+
+from settings import USER_AGENT, DEFAULT_TIMEOUT as TIMEOUT
+
+
+def remove_accents(text: str) -> str:
+    """Remove accents from text.
+
+    Args:
+        text (str): The text to remove accents from.
+
+    Returns:
+        str: The text without accents.
+    """
+    return "".join(
+        char
+        for char in unicodedata.normalize("NFD", text)
+        if not unicodedata.combining(char)
+    )
+
+
+class Harvester(ABC):
+    """Base class for harvesting data sources."""
+
+    output_folder: str
+    output_file: str
+    schema: Schema
+    title: str = None
+    description: str = None
+
+    def __init__(
+        self,
+        title: str = None,
+        description: str = None,
+        schema: Schema = None,
+        output_folder: str = "data/unverified",
+    ):
+        if title:
+            self.title = title
+        if description:
+            self.description = description
+        if schema:
+            self.schema = schema
+        if self.schema:
+            self.resource = Resource(
+                pd.DataFrame(columns=[field.name for field in schema.fields]),
+                schema=schema,
+                title=self.title,
+                description=self.description,
+            )
+        else:
+            self.resource = Resource(
+                pd.DataFrame(),
+                title=self.title,
+                description=self.description,
+            )
+        self.output_folder = output_folder
+
+    @property
+    def dataframe(self) -> pd.DataFrame:
+        """Shortcut to the resource data frame, containing the data
+        harvested so far.
+
+        Returns:
+            pd.DataFrame: Data harvested
+        """
+        return self.resource.data
+
+    def append(self, **kwargs):
+        """Append a row to the data frame."""
+        self.dataframe.loc[len(self.dataframe)] = kwargs
+
+    @abstractmethod
+    def harvest(self):
+        """Harvest the data."""
+
+    @property
+    @abstractmethod
+    def reference_data(self):
+        """Handle for the data frame of reference data that is going
+        to be updated."""
+        return None
+
+    @property
+    def municipality(self):
+        """Returns the auxiliary data resource for municipalities."""
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        aux_data_dir = os.path.join(
+            current_dir, "..", "..", "data", "auxiliary", "geographic"
+        )
+        geographic = Package(os.path.join(aux_data_dir, "datapackage.json"))
+        return geographic.get_resource("municipality")
+
+    def fill_municipal_codes(self, frame: pd.DataFrame) -> pd.DataFrame:
+        """Fill municipal codes in harvested data based on state code and
+        normalized municipality name.
+
+        Args:
+            frame (pd.DataFrame): Data frame without municipality codes.
+
+        Returns:
+            pd.DataFrame: Data frame with appropriate municipality codes.
+        """
+        frame["normalized_name"] = (
+            frame["municipality"].str.lower().apply(remove_accents)
+        )
+        frame = frame.drop(["municipality", "municipality_code"], axis=1)
+        codes = self.municipality.to_pandas().loc[:, ["uf", "name", "code"]]
+        codes["normalized_name"] = codes["name"].str.lower().apply(remove_accents)
+        codes = codes.rename(
+            columns={
+                "uf": "state_code",
+                "name": "municipality",
+                "code": "municipality_code",
+            }
+        )
+        merged = frame.merge(codes, on=["state_code", "normalized_name"])
+        merged = merged.reindex(
+            columns=[field.name for field in self.resource.schema.fields]
+        )
+        return merged
+
+    def save(self):
+        """Saves the file with candidate links."""
+        self.resource.data.to_csv(
+            os.path.join(self.output_folder, self.output_file), index=False
+        )
+
+
+class DataScraper(Harvester, ABC):
+    """Harvester for scraping data off websites."""
+
+    source_url: str
+    web_content: str
+
+    def __init__(self, *args, source_url: str = None, **kwargs):
+        self.web_content = None
+        if source_url:
+            self.source_url = source_url
+        super().__init__(*args, **kwargs)
+
+    def fetch(self, url: str):
+        """Fetches the website content from source and keeps it in the
+        DataScraper object.
+        """
+        response = requests.get(
+            url, headers={"user-agent": USER_AGENT}, timeout=TIMEOUT
+        )
+        response.raise_for_status()
+        self.web_content = response.content
+
+    @abstractmethod
+    def parse(self):
+        """Parse the page content and store it in the data frame."""
+
+    def harvest(self):
+        """Scrape the data by fetching and parsing the content of the
+        web page."""
+        self.fetch(self.source_url)
+        self.parse()
+
+
+class WebsiteLinkScraper(DataScraper):
+    """Harvester for scraping institutional website links."""
+
+    output_file: str = "municipality-website-candidate-links.csv"
+
+    def __init__(self, *args, **kwargs):
+        self.schema = self.municipality_website.schema
+        super().__init__(*args, schema=self.schema, **kwargs)
+
+    @property
+    def municipality_website(self):
+        """Returns the valid data resource for institutional websites."""
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        valid_data_dir = os.path.join(current_dir, "..", "..", "data", "valid")
+        valid = Package(os.path.join(valid_data_dir, "datapackage.json"))
+        return valid.get_resource("brazilian-municipality-and-state-websites")
+
+    @property
+    def reference_data(self):
+        """Reference data is the municipality websites resource."""
+        return self.municipality_website
+
+
+class TransparencyPortalScraper(DataScraper):
+    """Harvester for scraping transparency portal links."""
+
+    output_file: str = "municipality-transparency-portals-candidate-links.csv"
+
+    def __init__(self, *args, **kwargs):
+        self.schema = self.transparency_portal.schema
+        super().__init__(*args, schema=self.schema, **kwargs)
+
+    @property
+    def transparency_portal(self):
+        """Returns the valid data resource for transparency portals."""
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        valid_data_dir = os.path.join(current_dir, "..", "..", "data", "valid")
+        valid = Package(os.path.join(valid_data_dir, "datapackage.json"))
+        return valid.get_resource("brazilian-transparency-and-open-data-portals")
+
+    @property
+    def reference_data(self):
+        """Reference data is the municipality transparency portals resource."""
+        return self.transparency_portal
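
Besides the CLI entry point shown in the README, the same flow can be driven from Python. The sketch below is illustrative only: it assumes the modules resolve on the import path the same way they do when the CLI script runs (so that `harvest.scrapers` and `settings` are importable) and that the `data/unverified` output folder already exists.

```python
# Minimal usage sketch. The import path is an assumption about how the
# repository is laid out on sys.path; adjust it to your environment.
from fecamrn import FECAMRNScraper, extract_fecamrn_portals

# One-shot run: fetch the FECAMRN page, parse the candidate links, fill in
# the municipality codes and write
# municipality-transparency-portals-candidate-links.csv to the output folder.
extract_fecamrn_portals(
    output_folder="data/unverified",
    source_url=FECAMRNScraper.source_url,
)

# Step-by-step run, useful for inspecting the harvested rows before saving.
scraper = FECAMRNScraper(output_folder="data/unverified")
scraper.harvest()                    # fetch() the page, then parse() it
print(scraper.dataframe.head())      # candidate links; codes not filled yet
scraper.resource.data = scraper.fill_municipal_codes(scraper.dataframe)
scraper.save()
```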