Skip to content

Commit

Permalink
feat: verify availability of open data (#226)
Browse files Browse the repository at this point in the history
  • Loading branch information
sebhoerl authored Apr 19, 2024
1 parent 1adeca4 commit b0c45cc
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,9 @@ jobs:
shell: bash -el {0}
run: |
MKL_CBWR=AUTO pytest tests/
- name: Verify availability of open data
if: matrix.os == 'ubuntu-latest'
shell: bash -el {0}
run: |
python scripts/verify_data.py
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

**Under development**

- feat: check availability of open data sources for every PR
- feat: make statistical matching attribute list configurable
- feat: add urban type classification (unité urbaine)
- feat: functionality to make use of INSEE population projection data
Expand Down
135 changes: 135 additions & 0 deletions scripts/verify_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import requests

# The goal of this script is to verify the availability of the data
# that is needed to set up the pipeline

class Report:
    """Collect named data-source URLs and check that each one is reachable.

    Sources are kept in registration order in ``self.sources`` as dicts with
    the keys ``"name"`` and ``"url"``. ``validate`` adds a ``"status"`` key
    to each of them.
    """

    def __init__(self):
        self.sources = []

    def register(self, name, url):
        """Append a source with the given display name and URL."""
        self.sources.append({"name": name, "url": url})

    def validate(self, timeout=30):
        """Send a HEAD request to every registered source and print a report.

        Each source dict receives a ``"status"`` entry holding either the HTTP
        status code of the response or the string ``"error"`` when the request
        itself failed. Any status other than 200 counts as a failure.

        Args:
            timeout: Seconds to wait for a server before the request is
                abandoned, so that a stalled host cannot block the run.

        Returns:
            True when every source answered with status 200 (also when no
            source is registered), False otherwise.
        """
        failed = []
        total = len(self.sources)

        for index, source in enumerate(self.sources, start=1):
            print("[{}/{}] Checking {} ...".format(index, total, source["name"]))

            try:
                response = requests.head(source["url"], timeout=timeout)
            except requests.RequestException:
                # Only network/HTTP-level failures are reported as "error";
                # KeyboardInterrupt and programming errors still propagate.
                source["status"] = "error"
            else:
                source["status"] = response.status_code

            print(" Status {}".format(source["status"]))

            if source["status"] != 200:
                failed.append(source["name"])

        print("Done.")
        print("Missing: ", len(failed))
        print(failed)

        return len(failed) == 0

# Single report instance on which every data source of the script is registered
report = Report()

# Files hosted on www.insee.fr, as (display name, download URL) pairs;
# they are registered in exactly this order
_insee_sources = (
    (
        "Census data (RP 2019)",
        "https://www.insee.fr/fr/statistiques/fichier/6544333/RP2019_INDCVI_csv.zip",
    ),
    (
        "Population totals (RP 2019)",
        "https://www.insee.fr/fr/statistiques/fichier/6543200/base-ic-evol-struct-pop-2019.zip",
    ),
    (
        "Origin-destination data (RP-MOBPRO 2019)",
        "https://www.insee.fr/fr/statistiques/fichier/6456056/RP2019_mobpro_csv.zip",
    ),
    (
        "Origin-destination data (RP-MOBSCO 2019)",
        "https://www.insee.fr/fr/statistiques/fichier/6456052/RP2019_mobsco_csv.zip",
    ),
    (
        "Income tax data (Filosofi 2019), municipalities",
        "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-COMMUNES.zip",
    ),
    (
        "Income tax data (Filosofi 2019), administrative",
        "https://www.insee.fr/fr/statistiques/fichier/6036907/indic-struct-distrib-revenu-2019-SUPRA.zip",
    ),
    (
        "Service and facility census (BPE 2021)",
        "https://www.insee.fr/fr/statistiques/fichier/3568638/bpe21_ensemble_xy_csv.zip",
    ),
)

for _label, _link in _insee_sources:
    report.register(_label, _link)

# ENTD 2008 survey tables as (media identifier, table name) pairs; the
# identifier is inserted into the download URL, the name into the label
entd_sources = [
    (2339, "Q_tcm_menage_0"),
    (2555, "Q_tcm_individu"),
    (2556, "Q_menage"),
    (2565, "Q_individu"),
    (2566, "Q_ind_lieu_teg"),
    (2568, "K_deploc"),
]

_entd_label = "National household travel survey (ENTD 2008), {}"
_entd_url = "https://www.statistiques.developpement-durable.gouv.fr/media/{}/download?inline"

for _media_id, _table in entd_sources:
    report.register(_entd_label.format(_table), _entd_url.format(_media_id))

# Zoning and enterprise census (SIRENE) files as (display name, download URL)
# pairs; they are registered in exactly this order
_zoning_and_sirene_sources = (
    (
        "IRIS zoning system (2021)",
        "https://data.geopf.fr/telechargement/download/CONTOURS-IRIS/CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01/CONTOURS-IRIS_2-1__SHP__FRA_2021-01-01.7z",
    ),
    (
        "Zoning registry (2021)",
        "https://www.insee.fr/fr/statistiques/fichier/7708995/reference_IRIS_geo2021.zip",
    ),
    (
        "Enterprise census (SIRENE), Etablissement",
        "https://files.data.gouv.fr/insee-sirene/StockEtablissement_utf8.zip",
    ),
    (
        "Enterprise census (SIRENE), Unité Legale",
        "https://files.data.gouv.fr/insee-sirene/StockUniteLegale_utf8.zip",
    ),
    (
        "Enterprise census (SIRENE), Géolocalisé",
        "https://files.data.gouv.fr/insee-sirene-geo/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip",
    ),
)

for _source_label, _source_link in _zoning_and_sirene_sources:
    report.register(_source_label, _source_link)

# Department codes for which the per-department files are checked
_departments = (75, 77, 78, 91, 92, 93, 94, 95)

# The BD TOPO URL contains the department code twice (directory and file name)
_bdtopo_label = "Buildings database (BD TOPO), {}"
_bdtopo_url = "https://data.geopf.fr/telechargement/download/BDTOPO/BDTOPO_3-0_TOUSTHEMES_GPKG_LAMB93_D0{}_2022-03-15/BDTOPO_3-0_TOUSTHEMES_GPKG_LAMB93_D0{}_2022-03-15.7z"

_ban_label = "Adresses database (BAN), {}"
_ban_url = "https://adresse.data.gouv.fr/data/ban/adresses/latest/csv/adresses-{}.csv.gz"

# All BD TOPO entries are registered before the first BAN entry
for _code in _departments:
    report.register(_bdtopo_label.format(_code), _bdtopo_url.format(_code, _code))

for _code in _departments:
    report.register(_ban_label.format(_code), _ban_url.format(_code))

report.register(
    "Population projections",
    "https://www.insee.fr/fr/statistiques/fichier/5894093/00_central.xlsx"
)

report.register(
    "Urban type",
    "https://www.insee.fr/fr/statistiques/fichier/4802589/UU2020_au_01-01-2023.zip"
)

# Run all checks and report the outcome through the process exit code:
# 0 when every source is available, 1 otherwise. SystemExit is raised
# directly because the exit() helper is only installed by the site module
# and is not guaranteed to exist in every interpreter setup.
raise SystemExit(0 if report.validate() else 1)

0 comments on commit b0c45cc

Please sign in to comment.