Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(#8): implement a wrapper to read data-gouv website and download … #131

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 15 additions & 27 deletions etl/filter-cameroon.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# %%

# CSV Files downloaded from https://www.data.gouv.fr/fr/datasets/repertoire-national-des-associations/ Fichier RNA Waldec du 01 Mars 2022
import datetime as dt
import glob
Expand All @@ -15,10 +15,15 @@
from lambdaprompt import GPT3Prompt
from pandarallel import pandarallel
from rich.console import Console
from argparse import Namespace, ArgumentParser

parser = ArgumentParser()
parser.add_argument("--rna_folder", default="rna_waldec_20220301/")

args, _ = parser.parse_known_args()

# %%
start = time.time()
file_location = os.getcwd() + "/rna_waldec_20220301/"
file_location = os.path.join(os.getcwd(), args.rna_folder)
all_files = glob.glob(os.path.join(file_location, "*.csv"))

columns = [
Expand All @@ -42,9 +47,10 @@
f,
delimiter=";",
header=0,
encoding="ISO-8859-1",
# encoding="ISO-8859-1",
usecols=columns,
engine="c",
low_memory=False
)
for f in all_files
],
Expand All @@ -54,7 +60,7 @@
end = time.time()
print(f"Time to read all CSV : {dt.timedelta(seconds=end - start)}")

# %%

ssm = boto3.client("ssm", region_name="eu-central-1")

openai.api_key = ssm.get_parameter(
Expand All @@ -65,7 +71,6 @@
os.environ["OPENAI_API_KEY"] = openai.api_key


# %%
start = time.time()


Expand Down Expand Up @@ -130,15 +135,14 @@ def select_relevant_columns(df):
]



df_cameroon_associations = (
df_associations.pipe(filter_cameroon).pipe(remove_closed).pipe(normalize)
)

end = time.time()
print(f"Time to Filter Rows : {dt.timedelta(seconds=end - start)}")

# %%

text_prompt = """
Normalize the addresses in french.
Don't ignore any lines and treat each address separetely and go step by step
Expand Down Expand Up @@ -223,17 +227,6 @@ def select_relevant_columns(df):
all_adresses = [x.strip() for x in all_adresses]

# Build adresse by concatenation
df2["adrs"] = (
df2["adrs_numvoie"].map(str)
+ " "
+ df2["adrs_typevoie"].map(str)
+ " "
+ df2["adrs_libvoie"].map(str)
+ " "
+ df2["adrs_codepostal"].map(str)
+ " "
+ df2["adrs_libcommune"].map(str)
)
df_cameroon_associations["adrs"] = (
df_cameroon_associations["adrs_numvoie"].map(str)
+ " "
Expand All @@ -258,7 +251,7 @@ def select_relevant_columns(df):
]

print(f"{len(df_not_in_cache)} adresses not present in cache...")
# %%

if len(df_not_in_cache) > 0:
num_batches = int(np.ceil(len(df_not_in_cache) / 25))
batches = np.array_split(df_not_in_cache, num_batches)
Expand All @@ -280,7 +273,7 @@ def select_relevant_columns(df):
time.sleep(120)
batch["adrs"] = cache[list_adresses]

# %%

# Downloaded from https://download.geonames.org/export/zip/
region_by_postal_codes = pd.read_csv(
"code-postal-geonames.tsv", delimiter="\t", index_col=1
Expand Down Expand Up @@ -316,8 +309,6 @@ def select_relevant_columns(df):
waldec_csv[40580] = "ACTIVTÉS RELIGIEUSES, SPIRITUELLES OU PHILOSOPHIQUES"


# %%

def get_dept_region(code_postal):
try:
dept = dept_by_postal_codes[str(code_postal)]
Expand Down Expand Up @@ -372,7 +363,7 @@ def add_social_object_libelle(df):
# get_info("W212001727")
# get_dept_region(30913)

# %%

pandarallel.initialize(progress_bar=True)
requests_cache.install_cache("geocode_cache")

Expand Down Expand Up @@ -428,8 +419,6 @@ def format_libelle_for_gogocarto(df):
format_libelle_for_gogocarto
)

# %%


def remove_space_at_the_end(x: str):
if x is not None:
Expand Down Expand Up @@ -463,7 +452,6 @@ def normalize_final(data: pd.DataFrame):
df_cameroon_associations = df_cameroon_associations.pipe(normalize_final)


# %%
df_cameroon_associations.to_csv("rna-real-mars-2022-new.csv")


Expand Down
1 change: 1 addition & 0 deletions etl/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,4 @@ chainlit==0.5.1
tornado>=6.3.3 # not directly required, pinned by Snyk to avoid a vulnerability
aiohttp>=3.9.0 # not directly required, pinned by Snyk to avoid a vulnerability
sentry_sdk==1.39.1
beautifulsoup4==4.12.3
62 changes: 62 additions & 0 deletions etl/update_database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import os
import requests
from bs4 import BeautifulSoup
from zipfile import ZipFile

DATA_GOUV_PATH = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-associations/"


def read_data_gouv_page():
    """Fetch the data.gouv.fr RNA dataset page and return its raw HTML.

    Returns:
        bytes: the body of the dataset listing page.

    Raises:
        Exception: if the server answers with a non-success status code.
    """
    # Setting 'User-Agent' to None makes requests drop the header entirely,
    # avoiding the default "python-requests/..." UA that some sites reject.
    headers = {'User-Agent': None}
    response = requests.get(DATA_GOUV_PATH, headers=headers, timeout=30)
    # 2xx only: the original accepted 300, which is a redirect, not a success.
    if 200 <= response.status_code < 300:
        return response.content
    raise Exception(
        f"Failed to fetch {DATA_GOUV_PATH}: HTTP {response.status_code}"
    )


def download_link(url: str, headers=None):
    """Download *url* into the current directory if it looks like a file link.

    Only URLs ending in "download" or a known file extension are fetched;
    anything else is skipped.

    Args:
        url: absolute URL of the file to download.
        headers: optional HTTP headers forwarded to ``requests.get``.

    Returns:
        The local file name on success, or ``None`` when the URL was
        skipped or the server answered with a non-success status.
    """
    # str.endswith accepts a tuple, so one call covers every accepted suffix.
    if url.endswith(("download", ".pdf", ".docx", ".zip", ".exe", ".jpg", ".png")):
        response = requests.get(url, headers=headers, timeout=60)
        # 2xx only: the original accepted 300, which is a redirect, not a success.
        if 200 <= response.status_code < 300:
            name = os.path.basename(url)
            with open(name, "wb") as file:
                file.write(response.content)
            return name
    return None


def unzip_and_delete(path: str):
    """Extract a ``.zip`` archive into a sibling folder and return that folder.

    Note: despite the name, the zip file itself is NOT removed here; the
    caller is responsible for deleting it afterwards.

    Args:
        path: path to a ``.zip`` archive.

    Returns:
        The extraction directory (``path`` with the ``.zip`` suffix removed).
    """
    target = path.replace(".zip", "")
    # Context manager guarantees the archive handle is closed even if
    # extractall raises (the original leaked the handle on error).
    with ZipFile(path) as archive:
        archive.extractall(target)
    return target


def search_and_download_data():
    """Find and download the most recent RNA Waldec archive from data.gouv.fr.

    Scrapes the dataset page, keeps only links hosted on
    media.interieur.gouv, and downloads the lexically greatest
    "rna_waldec" one — file names embed the date (YYYYMMDD), so lexical
    order matches chronological order.

    Returns:
        The local file name of the downloaded archive, or ``None`` if the
        download was skipped or failed (see ``download_link``).

    Raises:
        Exception: if the page contains no rna_waldec link at all.
    """
    soup = BeautifulSoup(read_data_gouv_page(), 'html.parser')
    hrefs = [
        a["href"]
        for a in soup.find_all('a', href=True)
        if "media.interieur.gouv" in a["href"]
    ]
    waldec_links = sorted((h for h in hrefs if "rna_waldec" in h), reverse=True)
    # Fail with a clear message instead of a bare IndexError when the page
    # layout changes and no matching link is found.
    if not waldec_links:
        raise Exception("No rna_waldec link found on the data.gouv.fr page")
    return download_link(waldec_links[0])


if __name__ == "__main__":
    print("Searching for latest rna waldec version")
    path = search_and_download_data()
    print("extracting rna data")
    # unzip_and_delete returns the extraction folder (path minus ".zip").
    folder = unzip_and_delete(path)
    print("delete zip file")
    os.remove(path)
    # BUG FIX: the original reassigned folder = "rna_waldec_20241001" here,
    # silently discarding the freshly downloaded archive's folder and always
    # processing the hard-coded October 2024 snapshot.
    # NOTE(review): `folder` is derived from a scraped file name and is
    # interpolated into a shell command — confirm upstream names are trusted,
    # otherwise this is a shell-injection vector.
    os.system(
        f"secretsfoundry run --script 'python filter-cameroon.py --rna_folder {folder}'")
Loading