From fdd28865065192f429c7b479ad8271b72a21ba6e Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Wed, 6 Nov 2024 06:37:44 +0100 Subject: [PATCH] 20241031 - Acteurs inactifs reparation lvao (#990) * initial commit * initial commit * Delete notebooks/20241031_acteurs_inactifs_reparation_lvao_ancien_cma/validation_et_execution_requettes.ipynb * MAJ conforme aux requettes PREPROD et PROD * Clear cells output --------- Co-authored-by: Nicolas Oudard --- ..._inactifs_reparation_lvao_ancien_cma.ipynb | 307 ++++++++++++++++++ 1 file changed, 307 insertions(+) create mode 100644 notebooks/20241031_acteurs_inactifs_reparation_lvao_ancien_cma.ipynb diff --git a/notebooks/20241031_acteurs_inactifs_reparation_lvao_ancien_cma.ipynb b/notebooks/20241031_acteurs_inactifs_reparation_lvao_ancien_cma.ipynb new file mode 100644 index 000000000..a9b33e521 --- /dev/null +++ b/notebooks/20241031_acteurs_inactifs_reparation_lvao_ancien_cma.ipynb @@ -0,0 +1,307 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b86432b3", + "metadata": {}, + "source": [ + "# Rendre inactifs 6264 acteurs reparation LVAO (ancien CMA)\n", + "\n", + "[Ticket](https://www.notion.so/accelerateur-transition-ecologique-ademe/Supprimer-les-artisans-Ferm-s-de-la-vieille-source-CMA-non-r-paracteurs-12e6523d57d78045a70ed85ad8a47796)\n", + "\n", + " - **QUOI**: modifier 6264 acteurs LVAO `_reparation_` (anciennement CMA) en changeant `statut` = `INACTIF`\n", + " - **POURQUOI**: les entrées dans la base sont obsolètes\n", + " - **COMMENT**:\n", + " - intégration avec la base SIRENE en local\n", + " - génération d'un export et revue manuelle\n", + " - génération des requettes SQL sur la base de l'export\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "376f5950", + "metadata": {}, + "source": [ + "## Paramètres" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "2524cc6e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from datetime import datetime, timezone\n", + "\n", + "# Base de données\n", + "DB_TEST_URL = os.environ[\"DB_TEST_URL\"]\n", + "DB_PROD_URL = os.environ[\"DB_PROD_URL\"]\n", + "DB_TABLE = \"qfdmo_acteur\"\n", + "DB_TABLE_REVISION = \"qfdmo_revisionacteur\"\n", + "DB_STATUS_FIELD = \"statut\"\n", + "DB_STATUS_VALUE = \"INACTIF\"\n", + "DB_COMMENT_FIELD = \"commentaires\"\n", + "DB_COMMENT_VALUE = \"20241104_acteurs_inactifs_reparation_lvao_ancien_cma\"\n", + "DB_ID_FIELD = \"identifiant_unique\"\n", + "DB_TIMESTAMP_FIELD = \"modifie_le\"\n", + "# on ne touche pas à se qui a été modifié end prod\n", + "# depuis l'export\n", + "DB_TIMESTAMP_SAFEGUARD = datetime.strptime(\"2024-10-29 00:00:00\", \"%Y-%m-%d %H:%M:%S\").replace(tzinfo=timezone.utc)\n", + "\n", + "# File system\n", + "#DIR_CURRENT = os.path.dirname(os.path.abspath(__file__))\n", + "#DIR_DATA = os.path.join(DIR_CURRENT, \"data\")\n", + "\n", + "# Export validé\n", + "DATA_FILEPATH = \"./data/acteurs_a_desactiver.csv\"\n", + "DATA_FILTER_KEY = \"validation\"\n", + "# note: seule un échantillion à été validé donc on garde\n", + "# tout sauf 1 entrée modifiée sur prod entre temps\n", + "DATA_FILTER_EXCLUDE = \"NE PAS TOUCHER\"\n", + "DATA_FILTER_LENGTH = 9653 # on s'attend à modifier 6264 acteurs\n", + "DATA_STATUS_FIELD = \"statut\"\n", + "DATA_STATUS_CURRENT = \"ACTIF\"\n", + "\n", + "# Les champs de la source de données SIRENE nous indiquant\n", + "# que soit l'entreprise est en cessation (C), soit\n", + "# l'établissement est fermé (F)\n", + "DATA_SIREN_STATUS_FIELD = \"etatAdministratifUniteLegale\"\n", + "DATA_SIREN_STATUS_VALUE = \"C\"\n", + "DATA_SIRET_STATUS_FIELD = \"etatAdministratifEtablissement\"\n", + "DATA_SIRET_STATUS_VALUE = \"F\"\n" + ] + }, + { + "cell_type": "markdown", + "id": "c584d66c", + "metadata": {}, + "source": [ + "## Import des librairies" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "1790323e-8d84-4924-a4bc-fc74d186e42f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sqlalchemy import create_engine\n", + "from sqlalchemy.engine.url import make_url\n", + "from rich import print\n", + "from rich.progress import track" + ] + }, + { + "cell_type": "markdown", + "id": "b42d285d", + "metadata": {}, + "source": [ + "## Valider connection à la base de données" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "4819f9ab", + "metadata": {}, + "outputs": [], + "source": [ + "# Create and test engines\n", + "#assert DB_TEST_URL != DB_PROD_URL, \"DB_TEST_URL et DB_PROD_URL doivent être différents\"\n", + "db_test_engine = create_engine(DB_TEST_URL)\n", + "db_prod_engine = create_engine(DB_PROD_URL)\n", + "for engine in [db_test_engine, db_prod_engine]:\n", + " conn = engine.connect()\n", + " conn.execute(f\"SELECT * FROM {DB_TABLE} LIMIT 1\")\n", + " conn.close()" + ] + }, + { + "cell_type": "markdown", + "id": "0bf6cc64", + "metadata": {}, + "source": [ + "## Lecture et validation de la data locale" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61913a44", + "metadata": {}, + "outputs": [], + "source": [ + "# The default \"df\" corresponds to our local data\n", + "df = pd.read_csv(DATA_FILEPATH).replace({np.nan: None})\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a98545c5", + "metadata": {}, + "outputs": [], + "source": [ + "# On filtre les acteurs à modifier\n", + "filter_validation = df[DATA_FILTER_KEY] != DATA_FILTER_EXCLUDE\n", + "filter_siren = df[DATA_SIREN_STATUS_FIELD] == DATA_SIREN_STATUS_VALUE\n", + "filter_siret = df[DATA_SIRET_STATUS_FIELD] == DATA_SIRET_STATUS_VALUE\n", + "\n", + "df = df[filter_validation & (filter_siren | filter_siret)]\n", + "print(f\"Nombre d'acteurs à modifier: {len(df)}\")\n", + "assert len(df) == DATA_FILTER_LENGTH, f\"On devrait avoir {DATA_FILTER_LENGTH} acteurs à modifier\"" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "8bbba318", + "metadata": {}, + "outputs": [], + "source": [ + "# On s'assure qu'ils étaient tous actifs\n", + "assert all(df[DATA_STATUS_FIELD] == DATA_STATUS_CURRENT), \"Tous les acteurs devraient être actifs\"\n", + "# Qu'on avait pas de doublon sur le champ identifiant\n", + "assert len(df) == len(df[DB_ID_FIELD].unique()), f\"Présence de doublons sur le champ {DB_ID_FIELD}\"" + ] + }, + { + "cell_type": "markdown", + "id": "c7038246", + "metadata": {}, + "source": [ + "## Logique de modification de la base" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "dd92d530", + "metadata": {}, + "outputs": [], + "source": [ + "def sql_acteurs_select(table_name: str, ids: list):\n", + " \"\"\"Construit une requête SQL pour récupérer des acteurs matchant des IDs\"\"\"\n", + " return f\"\"\"SELECT {DB_ID_FIELD}, {DB_TIMESTAMP_FIELD}\n", + " FROM {table_name}\n", + " WHERE {DB_ID_FIELD} IN ({', '.join([\"'\"+str(i)+\"'\" for i in ids])})\"\"\"\n", + "\n", + "def db_update_from_df(df: pd.DataFrame, engine):\n", + " \"\"\"MAJ de la DB à partir d'un DataFrame source d'acteurs\"\"\"\n", + " url = make_url(engine.url)\n", + " print(\"db_update_from_df:\")\n", + " print({\"hostname\": url.host,\"port\":url.port, \"database\": url.database})\n", + "\n", + " # On récupère les IDs de la dataframe source\n", + " ids = df[DB_ID_FIELD].tolist()\n", + "\n", + " # On récupère les acteurs vs. révisions existants et on s'assure du match\n", + " df_act = pd.read_sql(con=engine, sql=sql_acteurs_select(DB_TABLE, ids))\n", + " df_rev = pd.read_sql(con=engine, sql=sql_acteurs_select(DB_TABLE_REVISION, ids))\n", + " print({\"# acteurs\": len(df_act), \"# révisions existantes\": len(df_rev)})\n", + " assert len(df_act) == len(df_rev), \"Certaines révisions manquantes, à créer...\"\n", + "\n", + " # On met à jour les révisions\n", + " print(\"MAJ des révisions: début...\")\n", + " count=0\n", + " for index, row in df_rev.iterrows():\n", + " sql_update = f\"\"\"\n", + " UPDATE {DB_TABLE_REVISION}\n", + " SET {DB_STATUS_FIELD} = '{DB_STATUS_VALUE}',\n", + " {DB_COMMENT_FIELD} = '{DB_COMMENT_VALUE}',\n", + " {DB_TIMESTAMP_FIELD} = NOW()\n", + " WHERE {DB_ID_FIELD} = '{row[DB_ID_FIELD]}'\"\"\"\n", + " if index==0:\n", + " print(sql_update)\n", + " engine.execute(sql_update)\n", + " count+=1\n", + " if count%500==0:\n", + " print(count)\n", + " print(\"MAJ des révisions: fin\")\n", + " print(\"# entrées modifiées:\", count)" + ] + }, + { + "cell_type": "markdown", + "id": "55954389", + "metadata": {}, + "source": [ + "## Modification de la base" + ] + }, + { + "cell_type": "markdown", + "id": "e2f1162c", + "metadata": {}, + "source": [ + "### Test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbf322b2", + "metadata": {}, + "outputs": [], + "source": [ + "db_update_from_df(df, db_test_engine)" + ] + }, + { + "cell_type": "markdown", + "id": "13900051", + "metadata": {}, + "source": [ + "## Production" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "19f3f7d2", + "metadata": {}, + "outputs": [], + "source": [ + "raise Exception(\"Sur la point de modifier la base de prod, retirer cette ligne pour continuer\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe72d881", + "metadata": {}, + "outputs": [], + "source": [ + "#db_update_from_df(df, db_prod_engine)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}