Skip to content

Commit

Permalink
feat(#8): implement a wrapper to read data-gouv website and download …
Browse files Browse the repository at this point in the history
…file
  • Loading branch information
KameniAlexNea committed Oct 20, 2024
1 parent 5ee55b2 commit 96a0fe5
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 29 deletions.
43 changes: 19 additions & 24 deletions etl/filter-cameroon.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# %%

# CSV Files downloaded from https://www.data.gouv.fr/fr/datasets/repertoire-national-des-associations/ Fichier RNA Waldec du 01 Mars 2022
import datetime as dt
import glob
Expand All @@ -15,10 +15,15 @@
from lambdaprompt import GPT3Prompt
from pandarallel import pandarallel
from rich.console import Console
from argparse import Namespace, ArgumentParser

# Command-line interface: --rna_folder selects the folder holding the RNA Waldec
# CSV export to process; it defaults to the March 2022 snapshot folder.
parser = ArgumentParser()
parser.add_argument("--rna_folder", default="rna_waldec_20220301/")

# parse_known_args() returns (namespace, leftover_args); unrecognized arguments
# are collected in the leftover list (discarded here) instead of raising an error.
# NOTE(review): presumably chosen so the script also runs under launchers that
# inject their own argv entries (e.g. notebook kernels) -- confirm.
args, _ = parser.parse_known_args()

# %%
start = time.time()
file_location = os.getcwd() + "/rna_waldec_20220301/"
file_location = os.path.join(os.getcwd(), args.rna_folder)
all_files = glob.glob(os.path.join(file_location, "*.csv"))

columns = [
Expand All @@ -42,9 +47,10 @@
f,
delimiter=";",
header=0,
encoding="ISO-8859-1",
# encoding="ISO-8859-1",
usecols=columns,
engine="c",
low_memory=False
)
for f in all_files
],
Expand All @@ -54,7 +60,7 @@
end = time.time()
print(f"Time to read all CSV : {dt.timedelta(seconds=end - start)}")

# %%

ssm = boto3.client("ssm", region_name="eu-central-1")

openai.api_key = ssm.get_parameter(
Expand All @@ -65,7 +71,7 @@
os.environ["OPENAI_API_KEY"] = openai.api_key


# %%

start = time.time()


Expand Down Expand Up @@ -138,7 +144,7 @@ def select_relevant_columns(df):
end = time.time()
print(f"Time to Filter Rows : {dt.timedelta(seconds=end - start)}")

# %%

text_prompt = """
Normalize the addresses in french.
Don't ignore any lines and treat each address separetely and go step by step
Expand Down Expand Up @@ -223,17 +229,6 @@ def select_relevant_columns(df):
all_adresses = [x.strip() for x in all_adresses]

# Build adresse by concatenation
df2["adrs"] = (
df2["adrs_numvoie"].map(str)
+ " "
+ df2["adrs_typevoie"].map(str)
+ " "
+ df2["adrs_libvoie"].map(str)
+ " "
+ df2["adrs_codepostal"].map(str)
+ " "
+ df2["adrs_libcommune"].map(str)
)
df_cameroon_associations["adrs"] = (
df_cameroon_associations["adrs_numvoie"].map(str)
+ " "
Expand All @@ -258,7 +253,7 @@ def select_relevant_columns(df):
]

print(f"{len(df_not_in_cache)} adresses not present in cache...")
# %%

if len(df_not_in_cache) > 0:
num_batches = int(np.ceil(len(df_not_in_cache) / 25))
batches = np.array_split(df_not_in_cache, num_batches)
Expand All @@ -280,7 +275,7 @@ def select_relevant_columns(df):
time.sleep(120)
batch["adrs"] = cache[list_adresses]

# %%

# Downloaded from https://download.geonames.org/export/zip/
region_by_postal_codes = pd.read_csv(
"code-postal-geonames.tsv", delimiter="\t", index_col=1
Expand Down Expand Up @@ -316,7 +311,7 @@ def select_relevant_columns(df):
waldec_csv[40580] = "ACTIVTÉS RELIGIEUSES, SPIRITUELLES OU PHILOSOPHIQUES"


# %%


def get_dept_region(code_postal):
try:
Expand Down Expand Up @@ -372,7 +367,7 @@ def add_social_object_libelle(df):
# get_info("W212001727")
# get_dept_region(30913)

# %%

pandarallel.initialize(progress_bar=True)
requests_cache.install_cache("geocode_cache")

Expand Down Expand Up @@ -428,7 +423,7 @@ def format_libelle_for_gogocarto(df):
format_libelle_for_gogocarto
)

# %%



def remove_space_at_the_end(x: str):
Expand Down Expand Up @@ -463,7 +458,7 @@ def normalize_final(data: pd.DataFrame):
df_cameroon_associations = df_cameroon_associations.pipe(normalize_final)


# %%

df_cameroon_associations.to_csv("rna-real-mars-2022-new.csv")


Expand Down
27 changes: 22 additions & 5 deletions etl/update_database.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import os
import requests
from bs4 import BeautifulSoup
from zipfile import ZipFile
import sys
import runpy

DATA_GOUV_PATH = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-associations/"

Expand All @@ -19,6 +22,12 @@ def download_link(url: str, headers=None):
with open(name, "wb") as file:
file.write(response.content)
return name

def unzip_and_delete(path: str):
    """Extract the zip archive at ``path`` and return the extraction folder.

    The destination folder is ``path`` with its trailing ``.zip`` extension
    removed (``path`` itself when it does not end with ``.zip``).

    Despite its name, this function does not delete the archive; the caller
    is responsible for removing ``path`` once extraction has succeeded.

    Args:
        path: Location of the zip archive on disk.

    Returns:
        The folder the archive content was extracted into.
    """
    suffix = ".zip"
    # Strip only the trailing extension: str.replace() would also remove any
    # ".zip" occurring earlier in the path (e.g. in a parent directory name).
    folder = path[: -len(suffix)] if path.endswith(suffix) else path
    # The context manager closes the archive even if extraction fails.
    with ZipFile(path) as zipped:
        zipped.extractall(folder)
    return folder

def search_and_download_data():
page = read_data_gouv_page()
Expand All @@ -27,15 +36,23 @@ def search_and_download_data():
links: list[str] = [
i["href"] for i in links if ("media.interieur.gouv" in i["href"])
]
rna_import = [i for i in links if "rna_import" in i]
# rna_import = [i for i in links if "rna_import" in i]
rna_waldec = [i for i in links if "rna_waldec" in i]

rna_import = sorted(rna_import, reverse=True)[0]
# rna_import = sorted(rna_import, reverse=True)[0]
rna_waldec = sorted(rna_waldec, reverse=True)[0]

rna_import = download_link(rna_import)
# rna_import = download_link(rna_import)
rna_waldec = download_link(rna_waldec)
return rna_import, rna_waldec
return rna_waldec

if __name__ == "__main__":
    # Script entry point: download the newest RNA Waldec archive, extract it,
    # remove the archive, then run the filtering script on the extracted folder.

    # Imported locally: only the entry point needs to spawn a child process.
    import subprocess

    print("Searching for lastest rna waldec version")
    path = search_and_download_data()
    print("extracting rna data")
    # unzip_and_delete returns the folder the archive was extracted into.
    folder = unzip_and_delete(path)
    print("delete zip file")
    os.remove(path)
    # Use the folder that was just extracted (not a hard-coded snapshot name),
    # and pass the command as an argument list so the folder name, which comes
    # from a scraped URL, is never interpreted by an intermediate shell.
    subprocess.run(
        [
            "secretsfoundry",
            "run",
            "--script",
            f"python filter-cameroon.py --rna_folder {folder}",
        ]
    )

0 comments on commit 96a0fe5

Please sign in to comment.