Skip to content

Commit

Permalink
feat(#8): implement a wrapper to read data-gouv website and download …
Browse files Browse the repository at this point in the history
…file
  • Loading branch information
KameniAlexNea committed Oct 20, 2024
1 parent 96a0fe5 commit 8f43114
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 12 deletions.
7 changes: 0 additions & 7 deletions etl/filter-cameroon.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@
os.environ["OPENAI_API_KEY"] = openai.api_key



start = time.time()


Expand Down Expand Up @@ -136,7 +135,6 @@ def select_relevant_columns(df):
]



df_cameroon_associations = (
df_associations.pipe(filter_cameroon).pipe(remove_closed).pipe(normalize)
)
Expand Down Expand Up @@ -311,8 +309,6 @@ def select_relevant_columns(df):
waldec_csv[40580] = "ACTIVTÉS RELIGIEUSES, SPIRITUELLES OU PHILOSOPHIQUES"




def get_dept_region(code_postal):
try:
dept = dept_by_postal_codes[str(code_postal)]
Expand Down Expand Up @@ -424,8 +420,6 @@ def format_libelle_for_gogocarto(df):
)




def remove_space_at_the_end(x: str):
if x is not None:
return x.strip()
Expand Down Expand Up @@ -458,7 +452,6 @@ def normalize_final(data: pd.DataFrame):
df_cameroon_associations = df_cameroon_associations.pipe(normalize_final)



df_cameroon_associations.to_csv("rna-real-mars-2022-new.csv")


Expand Down
14 changes: 9 additions & 5 deletions etl/update_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,35 @@
import requests
from bs4 import BeautifulSoup
from zipfile import ZipFile
import sys
import runpy

DATA_GOUV_PATH = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-associations/"


def read_data_gouv_page(timeout=30):
    """Fetch the raw HTML of the data.gouv.fr RNA dataset page.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as bytes.

    Raises:
        requests.HTTPError: If the server answers with a non-2xx status. The
            failing response is attached as ``.response`` so its body is
            still available to the caller.
    """
    # A None value removes the User-Agent header from the outgoing request.
    headers = {'User-Agent': None}
    response = requests.get(DATA_GOUV_PATH, headers=headers, timeout=timeout)
    # Success is strictly 2xx; 300 (Multiple Choices) is not a usable page.
    if 200 <= response.status_code < 300:
        return response.content
    raise requests.HTTPError(
        f"Unexpected HTTP status {response.status_code} for {DATA_GOUV_PATH}",
        response=response,
    )


def download_link(url: str, headers=None, timeout=60):
    """Download ``url`` into the current working directory.

    Only URLs that end with ``download`` or with one of a fixed set of file
    extensions are fetched; anything else is ignored.

    Args:
        url: Address of the resource to download.
        headers: Optional HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The local file name (the last path component of ``url``) when the
        download succeeded, or ``None`` when the URL was skipped or the
        server did not answer with a 2xx status.
    """
    downloadable_suffixes = (
        "download", ".pdf", ".docx", ".zip", ".exe", ".jpg", ".png",
    )
    if not url.endswith(downloadable_suffixes):
        return None

    response = requests.get(url, headers=headers, timeout=timeout)
    # Success is strictly 2xx; 300 (Multiple Choices) carries no file body.
    if not 200 <= response.status_code < 300:
        return None

    name = os.path.basename(url)
    with open(name, "wb") as file:
        file.write(response.content)
    return name



def unzip_and_delete(path: str):
    """Extract the zip archive at ``path`` and return the extraction folder.

    The destination folder is ``path`` with its trailing ``.zip`` extension
    removed. Despite its name, this function does not delete the archive;
    removing the file is left to the caller.

    Args:
        path: Location of the zip archive.

    Returns:
        The path of the folder the archive was extracted into.
    """
    suffix = ".zip"
    if path.endswith(suffix):
        # Strip only the extension: str.replace would also rewrite any
        # ".zip" that appears earlier in the path (e.g. in a parent folder).
        destination = path[:-len(suffix)]
    else:
        # Paths without a ".zip" extension keep the legacy behaviour.
        destination = path.replace(suffix, "")

    # The context manager closes the archive even if extraction fails.
    with ZipFile(path) as zipped:
        zipped.extractall(destination)
    return destination


def search_and_download_data():
page = read_data_gouv_page()
soup = BeautifulSoup(page, 'html.parser')
Expand All @@ -46,6 +48,7 @@ def search_and_download_data():
rna_waldec = download_link(rna_waldec)
return rna_waldec


if __name__ == "__main__":
print("Searching for lastest rna waldec version")
path = search_and_download_data()
Expand All @@ -55,4 +58,5 @@ def search_and_download_data():
print("delete zip file")
os.remove(path)
folder = "rna_waldec_20241001"
os.system(f"secretsfoundry run --script 'python filter-cameroon.py --rna_folder {folder}'")
os.system(
f"secretsfoundry run --script 'python filter-cameroon.py --rna_folder {folder}'")

0 comments on commit 8f43114

Please sign in to comment.