From 8f431149ffdfe4eeb1a9b53a8944f336c62d57f8 Mon Sep 17 00:00:00 2001 From: KameniAlexNea Date: Sun, 20 Oct 2024 14:10:51 +0200 Subject: [PATCH] feat(#8): implement a wrapper to read data-gouv website and download file --- etl/filter-cameroon.py | 7 ------- etl/update_database.py | 14 +++++++++----- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/etl/filter-cameroon.py b/etl/filter-cameroon.py index 8d6f508..b664033 100644 --- a/etl/filter-cameroon.py +++ b/etl/filter-cameroon.py @@ -71,7 +71,6 @@ os.environ["OPENAI_API_KEY"] = openai.api_key - start = time.time() @@ -136,7 +135,6 @@ def select_relevant_columns(df): ] - df_cameroon_associations = ( df_associations.pipe(filter_cameroon).pipe(remove_closed).pipe(normalize) ) @@ -311,8 +309,6 @@ def select_relevant_columns(df): waldec_csv[40580] = "ACTIVTÉS RELIGIEUSES, SPIRITUELLES OU PHILOSOPHIQUES" - - def get_dept_region(code_postal): try: dept = dept_by_postal_codes[str(code_postal)] @@ -424,8 +420,6 @@ def format_libelle_for_gogocarto(df): ) - - def remove_space_at_the_end(x: str): if x is not None: return x.strip() @@ -458,7 +452,6 @@ def normalize_final(data: pd.DataFrame): df_cameroon_associations = df_cameroon_associations.pipe(normalize_final) - df_cameroon_associations.to_csv("rna-real-mars-2022-new.csv") diff --git a/etl/update_database.py b/etl/update_database.py index 864aa57..bf48c2c 100644 --- a/etl/update_database.py +++ b/etl/update_database.py @@ -2,11 +2,10 @@ import requests from bs4 import BeautifulSoup from zipfile import ZipFile -import sys -import runpy DATA_GOUV_PATH = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-associations/" + def read_data_gouv_page(): headers = {'User-Agent': None} response = requests.get(DATA_GOUV_PATH, headers=headers) @@ -14,21 +13,24 @@ def read_data_gouv_page(): return response.content raise Exception(response.content) + def download_link(url: str, headers=None): if url.endswith("download") or url.endswith((".pdf", ".docx", ".zip", ".exe", ".jpg", ".png")): response = requests.get(url, headers=headers) - if (200 <= response.status_code <= 300): + if (200 <= response.status_code <= 300): name = os.path.basename(url) with open(name, "wb") as file: file.write(response.content) return name - + + def unzip_and_delete(path: str): zipped = ZipFile(path) zipped.extractall(path.replace(".zip", "")) zipped.close() return path.replace(".zip", "") + def search_and_download_data(): page = read_data_gouv_page() soup = BeautifulSoup(page, 'html.parser') @@ -46,6 +48,7 @@ def search_and_download_data(): rna_waldec = download_link(rna_waldec) return rna_waldec + if __name__ == "__main__": print("Searching for lastest rna waldec version") path = search_and_download_data() @@ -55,4 +58,5 @@ def search_and_download_data(): print("delete zip file") os.remove(path) folder = "rna_waldec_20241001" - os.system(f"secretsfoundry run --script 'python filter-cameroon.py --rna_folder {folder}'") \ No newline at end of file + os.system( + f"secretsfoundry run --script 'python filter-cameroon.py --rna_folder {folder}'")