feat(#8): implement a wrapper to read data-gouv website and download file
mongulu-cm · Oct 20, 2024 · 8f43114 · 8f43114
1 parent 96a0fe5
commit 8f43114
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 12 deletions.
diff --git a/etl/filter-cameroon.py b/etl/filter-cameroon.py
@@ -71,7 +71,6 @@
 os.environ["OPENAI_API_KEY"] = openai.api_key
 
 
-
 start = time.time()
 
 
@@ -136,7 +135,6 @@ def select_relevant_columns(df):
     ]
 
 
-
 df_cameroon_associations = (
     df_associations.pipe(filter_cameroon).pipe(remove_closed).pipe(normalize)
 )
@@ -311,8 +309,6 @@ def select_relevant_columns(df):
 waldec_csv[40580] = "ACTIVTÉS RELIGIEUSES, SPIRITUELLES OU PHILOSOPHIQUES"
 
 
-
-
 def get_dept_region(code_postal):
     try:
         dept = dept_by_postal_codes[str(code_postal)]
@@ -424,8 +420,6 @@ def format_libelle_for_gogocarto(df):
 )
 
 
-
-
 def remove_space_at_the_end(x: str):
     if x is not None:
         return x.strip()
@@ -458,7 +452,6 @@ def normalize_final(data: pd.DataFrame):
 df_cameroon_associations = df_cameroon_associations.pipe(normalize_final)
 
 
-
 df_cameroon_associations.to_csv("rna-real-mars-2022-new.csv")
 
 

diff --git a/etl/update_database.py b/etl/update_database.py
@@ -2,33 +2,35 @@
 import requests
 from bs4 import BeautifulSoup
 from zipfile import ZipFile
-import sys
-import runpy
 
 DATA_GOUV_PATH = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-associations/"
 
+
 def read_data_gouv_page():
     """Fetch the data.gouv dataset page and return its raw body.

     Sends a GET request to ``DATA_GOUV_PATH`` and returns
     ``response.content`` when the status code lies between 200 and 300,
     both bounds included.

     Raises:
         Exception: carrying the response body, for any other status code.
     """
     # NOTE(review): a None header value presumably tells requests to drop
     # its default User-Agent header -- confirm against the requests docs.
     headers = {'User-Agent': None}
     response = requests.get(DATA_GOUV_PATH, headers=headers)
     # NOTE(review): the upper bound is inclusive, so a 300 status is also
     # treated as success; `< 300` may have been the intent.
     if 200 <= response.status_code <= 300:
         return response.content
     raise Exception(response.content)
 
+
 def download_link(url: str, headers=None):
     """Download ``url`` to a local file when it looks like a file link.

     Only URLs ending in "download" or in one of the listed file
     extensions are requested. On a status code between 200 and 300
     (both included) the response body is written to a file named after
     ``os.path.basename(url)`` -- a relative name, so it is created in the
     current working directory -- and that name is returned.

     Args:
         url: Address to download.
         headers: Optional headers forwarded to ``requests.get``.

     Returns:
         The local file name on success. Falls through and returns None
         when the URL does not match or the status code is out of range.
     """
     if url.endswith("download") or url.endswith((".pdf", ".docx", ".zip", ".exe", ".jpg", ".png")):
         # The whole body is held in memory (response.content) before
         # being written out.
         response = requests.get(url, headers=headers)
-        if  (200 <= response.status_code <= 300):
+        if (200 <= response.status_code <= 300):
             name = os.path.basename(url)
             with open(name, "wb") as file:
                 file.write(response.content)
             return name
-
+
+
 def unzip_and_delete(path: str):
     """Extract the zip archive at ``path`` and return the target folder.

     The target folder is ``path`` with every occurrence of ".zip"
     removed (``str.replace`` is not limited to the trailing suffix).

     NOTE(review): despite the name, nothing is deleted here; the archive
     is still on disk when this function returns.

     Args:
         path: Location of the zip archive.

     Returns:
         The folder the archive was extracted into.
     """
     zipped = ZipFile(path)
     zipped.extractall(path.replace(".zip", ""))
     # NOTE(review): if extractall raises, close() is never reached; a
     # `with ZipFile(path) as ...` block would cover that case.
     zipped.close()
     return path.replace(".zip", "")
 
+
 def search_and_download_data():
     page = read_data_gouv_page()
     soup = BeautifulSoup(page, 'html.parser')
@@ -46,6 +48,7 @@ def search_and_download_data():
     rna_waldec = download_link(rna_waldec)
     return rna_waldec
 
+
 if __name__ == "__main__":
     print("Searching for lastest rna waldec version")
     path = search_and_download_data()
@@ -55,4 +58,5 @@ def search_and_download_data():
     print("delete zip file")
     os.remove(path)
     folder = "rna_waldec_20241001"
-    os.system(f"secretsfoundry run --script 'python filter-cameroon.py --rna_folder {folder}'")
+    os.system(
+        f"secretsfoundry run --script 'python filter-cameroon.py --rna_folder {folder}'")