testing and egt
sebhoerl committed Mar 18, 2024
1 parent d21ccc7 commit 4d4e5bf
Showing 7 changed files with 134 additions and 41 deletions.
21 changes: 21 additions & 0 deletions data/hts/egt/cleaned.py
@@ -10,6 +10,9 @@
def configure(context):
context.stage("data.hts.egt.raw")

if context.config("use_urban_type", False):
context.stage("data.spatial.urban_type")

INCOME_CLASS_BOUNDS = [800, 1200, 1600, 2000, 2400, 3000, 3500, 4500, 5500, 1e6]

PURPOSE_MAP = {
@@ -111,6 +114,24 @@ def execute(context):
df_households.loc[df_households["income_class"].isin([10.0, 11.0, np.nan]), "income_class"] = -1
df_households["income_class"] = df_households["income_class"].astype(int)

# Impute urban type
if context.config("use_urban_type"):
df_urban_type = context.stage("data.spatial.urban_type")[[
"commune_id", "urban_type"
]]

# Household municipality
df_households["commune_id"] = df_households["RESCOMM"].astype("category")
df_persons = pd.merge(df_persons, df_households[["household_id", "commune_id"]], how = "left")
assert np.all(~df_persons["commune_id"].isna())

# Impute urban type
df_persons = pd.merge(df_persons, df_urban_type, on = "commune_id", how = "left")
df_persons["urban_type"] = df_persons["urban_type"].fillna("none").astype("category")

df_households = df_households.drop(columns = ["commune_id"]) # drop is not in-place; the result must be assigned
df_persons = df_persons.drop(columns = ["commune_id"])

# Trip purpose
df_trips["following_purpose"] = "other"
df_trips["preceding_purpose"] = "other"
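For reference, a minimal sketch of the imputation pattern introduced above, using hypothetical miniature frames (the real stage pulls its urban types from data.spatial.urban_type and works on the full EGT tables):

import numpy as np
import pandas as pd

# Hypothetical stand-ins for the stage outputs
df_urban_type = pd.DataFrame({
    "commune_id": ["75101", "77111"],
    "urban_type": ["central_city", "suburb"]
})

df_households = pd.DataFrame({
    "household_id": [1, 2],
    "RESCOMM": ["75101", "91471"]  # second commune is absent from the urban type table
})

df_persons = pd.DataFrame({ "person_id": [10, 11], "household_id": [1, 2] })

# The commune comes from the household, then a left merge attaches the urban type
df_households["commune_id"] = df_households["RESCOMM"]
df_persons = pd.merge(df_persons, df_households[["household_id", "commune_id"]], how = "left")
assert np.all(~df_persons["commune_id"].isna())

df_persons = pd.merge(df_persons, df_urban_type, on = "commune_id", how = "left")
df_persons["urban_type"] = df_persons["urban_type"].fillna("none").astype("category")
# person 11 falls back to urban_type "none"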
13 changes: 9 additions & 4 deletions data/hts/egt/filtered.py
@@ -12,7 +12,6 @@ def configure(context):

def execute(context):
df_codes = context.stage("data.spatial.codes")
assert (df_codes["region_id"] == 11).all() # Otherwise EGT doesn't make sense

df_households, df_persons, df_trips = context.stage("data.hts.egt.cleaned")

@@ -39,9 +38,15 @@
df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]

# Finish up
df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"]]
df_persons = df_persons[hts.PERSON_COLUMNS + ["egt_household_id", "egt_person_id"]]
df_trips = df_trips[hts.TRIP_COLUMNS + ["euclidean_distance"] + ["egt_household_id", "egt_person_id", "egt_trip_id"]]
household_columns = hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"]
df_households = df_households[household_columns]

person_columns = hts.PERSON_COLUMNS + ["egt_household_id", "egt_person_id"]
if "urban_type" in df_persons: person_columns.append("urban_type")
df_persons = df_persons[person_columns]

trip_columns = hts.TRIP_COLUMNS + ["euclidean_distance"] + ["egt_household_id", "egt_person_id", "egt_trip_id"]
df_trips = df_trips[trip_columns]

hts.check(df_households, df_persons, df_trips)

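A note on the guard above: membership tests on a pandas DataFrame check column labels, so urban_type is only selected when the cleaned stage actually produced it. A tiny sketch with a hypothetical frame:

import pandas as pd

df_persons = pd.DataFrame({ "person_id": [1], "urban_type": ["suburb"] })

person_columns = ["person_id"]
if "urban_type" in df_persons:  # membership on a DataFrame tests column names
    person_columns.append("urban_type")

df_persons = df_persons[person_columns]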
28 changes: 17 additions & 11 deletions data/spatial/urban_type.py
@@ -23,7 +23,7 @@ def configure(context):
context.stage("data.spatial.municipalities")

context.config("data_path")
context.config("urban_type_path", "uu/UU2020_au_01-01-2023.zip")
context.config("urban_type_path", "urban_type/UU2020_au_01-01-2023.zip")

def execute(context):
with zipfile.ZipFile("{}/{}".format(
@@ -33,7 +33,7 @@ def execute(context):
df = pd.read_excel(f, sheet_name = "Composition_communale", skiprows = 5)

df = df[["CODGEO", "STATUT_2017"]].copy()
df = df.set_axis(["commune_id", "type_uu"], axis = "columns")
df = df.set_axis(["commune_id", "urban_type"], axis = "columns")

# Cities that have districts are not detailed in the UU file; only the whole city is mentioned
# However, the municipalities file lists the districts with their respective INSEE codes
@@ -43,21 +43,27 @@

# Replace each line of the UU file corresponding to a city with districts by multiple lines, one per district
for city_code in cities_with_districts:
uu_type = df[df["commune_id"] == city_code].iloc[0].loc["type_uu"]
df.drop(df[df["commune_id"] == city_code].index, inplace=True)
new_lines = {"commune_id": [district_id for district_id in cities_with_districts[city_code]],
"type_uu": [uu_type for i in range(len(cities_with_districts[city_code]))]}
df = pd.concat([df, pd.DataFrame.from_dict(new_lines)])
base_type = df[df["commune_id"] == city_code].iloc[0]["urban_type"]
replacement_codes = cities_with_districts[city_code]

df = pd.concat([df, pd.DataFrame({
"commune_id": replacement_codes,
"urban_type": [base_type] * len(replacement_codes)
})])

df = df[~df["commune_id"].isin(cities_with_districts.keys())]

# Clean unités urbaines
df["type_uu"] = df["type_uu"].replace({"B":"suburb","C":"central_city","I":"isolated_city","H":"none"})
assert np.all(~df["type_uu"].isna())
df["type_uu"] = df["type_uu"].astype("category")
df["urban_type"] = df["urban_type"].replace({"B":"suburb","C":"central_city","I":"isolated_city","H":"none"})
assert np.all(~df["urban_type"].isna())
df["urban_type"] = df["urban_type"].astype("category")

df_municipalities = context.stage("data.spatial.municipalities")
requested_communes = set(df_municipalities["commune_id"].unique())
df = df[df["commune_id"].isin(requested_communes)]


assert len(df["commune_id"].unique()) == len(df)

return df

def validate(context):
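A condensed sketch of the district handling above, on a hypothetical two-row frame; "75056" is Paris as a whole, and the three arrondissement codes stand in for the full list:

import pandas as pd

df = pd.DataFrame({ "commune_id": ["75056", "77111"], "urban_type": ["C", "B"] })
cities_with_districts = { "75056": ["75101", "75102", "75103"] }  # truncated for illustration

for city_code, district_codes in cities_with_districts.items():
    # Each district inherits the urban type of its parent city
    base_type = df[df["commune_id"] == city_code].iloc[0]["urban_type"]
    df = pd.concat([df, pd.DataFrame({
        "commune_id": district_codes,
        "urban_type": [base_type] * len(district_codes)
    })])

# Drop the parent cities once their districts carry the type
df = df[~df["commune_id"].isin(cities_with_districts.keys())]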
18 changes: 12 additions & 6 deletions synthesis/population/matched.py
@@ -20,7 +20,8 @@
}

DEFAULT_MATCHING_ATTRIBUTES = [
"sex", "any_cars", "age_class", "socioprofessional_class"
"sex", "any_cars", "age_class", "socioprofessional_class",
"departement_id"
]

def configure(context):
@@ -117,6 +118,9 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ

progress.update(np.count_nonzero(unassigned_mask))

if np.count_nonzero(unassigned_mask) > 0:
raise RuntimeError("Some target observations could not be matched. Minimum observations configured too high?")

assert np.count_nonzero(unassigned_mask) == 0
assert np.count_nonzero(assigned_indices == -1) == 0

@@ -174,8 +178,7 @@ def execute(context):

try:
default_index = columns.index("*default*")
del columns[default_index]
columns.insert(default_index, DEFAULT_MATCHING_ATTRIBUTES)
columns[default_index:default_index + 1] = DEFAULT_MATCHING_ATTRIBUTES
except ValueError: pass

# Define matching attributes
@@ -199,9 +202,12 @@ def execute(context):
df_source = df_source.rename(columns = { "person_id": "hts_id" })

for column in columns:
assert column in df_source
assert column in df_target

if not column in df_source:
raise RuntimeError("Attribute not available in source (HTS) for matching: {}".format(column))

if not column in df_target:
raise RuntimeError("Attribute not available in target (census) for matching: {}".format(column))

df_assignment, levels = parallel_statistical_matching(
context,
df_source, "hts_id", "person_weight",
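The slice assignment replacing the earlier del/insert pair fixes a subtle bug: list.insert adds the whole list as one nested element, while a slice splice expands it in place. A minimal illustration:

DEFAULT_MATCHING_ATTRIBUTES = [
    "sex", "any_cars", "age_class", "socioprofessional_class",
    "departement_id"
]

columns = ["income_class", "*default*", "urban_type"]

default_index = columns.index("*default*")
columns[default_index:default_index + 1] = DEFAULT_MATCHING_ATTRIBUTES
# -> ["income_class", "sex", "any_cars", "age_class",
#     "socioprofessional_class", "departement_id", "urban_type"]

# The previous del/insert pair nested the list instead:
# del columns[default_index]
# columns.insert(default_index, DEFAULT_MATCHING_ATTRIBUTES)
# -> ["income_class", ["sex", ...], "urban_type"]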
12 changes: 10 additions & 2 deletions tests/test_determinism.py
@@ -54,7 +54,11 @@ def _test_determinism(index, data_path, tmpdir):
regions = [10, 11], sampling_rate = 1.0, hts = "entd",
random_seed = 1000, processes = 1,
secloc_maximum_iterations = 10,
maven_skip_tests = True
maven_skip_tests = True,
matching_attributes = [
"sex", "any_cars", "age_class", "socioprofessional_class",
"income_class", "departement_id"
]
)

stages = [
@@ -111,7 +115,11 @@ def _test_determinism_matsim(index, data_path, tmpdir):
regions = [10, 11], sampling_rate = 1.0, hts = "entd",
random_seed = 1000, processes = 1,
secloc_maximum_iterations = 10,
maven_skip_tests = True
maven_skip_tests = True,
matching_attributes = [
"sex", "any_cars", "age_class", "socioprofessional_class",
"income_class", "departement_id"
]
)

stages = [
37 changes: 30 additions & 7 deletions tests/test_pipeline.py
@@ -2,6 +2,7 @@
import os
import hashlib
from . import testdata
import pandas as pd

def test_data(tmpdir):
data_path = str(tmpdir.mkdir("data"))
@@ -34,7 +35,7 @@ def test_data(tmpdir):
assert os.path.isfile("%s/ile_de_france_hts_trips.csv" % output_path)
assert os.path.isfile("%s/ile_de_france_sirene.gpkg" % output_path)

def run_population(tmpdir, hts, mode_choice):
def run_population(tmpdir, hts, update = {}):
data_path = str(tmpdir.mkdir("data"))
testdata.create(data_path)

@@ -45,9 +46,9 @@ def run_population(tmpdir, hts, mode_choice):
regions = [10, 11], sampling_rate = 1.0, hts = hts,
random_seed = 1000, processes = 1,
secloc_maximum_iterations = 10,
maven_skip_tests = True,
mode_choice = mode_choice
maven_skip_tests = True
)
config.update(update)

stages = [
dict(descriptor = "synthesis.output"),
@@ -62,11 +63,33 @@ def run_population(tmpdir, hts, mode_choice):
assert os.path.isfile("%s/ile_de_france_trips.gpkg" % output_path)
assert os.path.isfile("%s/ile_de_france_meta.json" % output_path)

assert 2235 == len(pd.read_csv("%s/ile_de_france_activities.csv" % output_path, usecols = ["household_id"], sep = ";"))
assert 447 == len(pd.read_csv("%s/ile_de_france_persons.csv" % output_path, usecols = ["household_id"], sep = ";"))
assert 149 == len(pd.read_csv("%s/ile_de_france_households.csv" % output_path, usecols = ["household_id"], sep = ";"))

def test_population_with_entd(tmpdir):
run_population(tmpdir, "entd", False)
run_population(tmpdir, "entd")

def test_population_with_egt(tmpdir):
run_population(tmpdir, "egt")

def test_population_with_mode_choice(tmpdir):
run_population(tmpdir, "entd", True)
run_population(tmpdir, "entd", { "mode_choice": True })

def test_population_with_urban_type(tmpdir):
run_population(tmpdir, "entd", {
"use_urban_type": True,
"matching_attributes": [
"urban_type", "*default*"
],
"matching_minimum_observations": 5
})

#def test_population_with_egt(tmpdir):
# run_population(tmpdir, "entd") # TODO: Fix this!
def test_population_with_urban_type_and_egt(tmpdir):
run_population(tmpdir, "egt", {
"use_urban_type": True,
"matching_attributes": [
"urban_type", "*default*"
],
"matching_minimum_observations": 5
})
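The reworked run_population merges a per-test override into the base configuration with dict.update, so each test states only what differs from the defaults. A sketch of the pattern, with values abbreviated from the tests above:

config = dict(
    regions = [10, 11], sampling_rate = 1.0, hts = "entd",
    random_seed = 1000, processes = 1,
    maven_skip_tests = True
)

update = { "use_urban_type": True, "matching_minimum_observations": 5 }
config.update(update)  # override keys win, all other keys keep their defaults

assert config["hts"] == "entd"
assert config["use_urban_type"]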
46 changes: 35 additions & 11 deletions tests/testdata.py
@@ -301,7 +301,7 @@ def create(output_path):
"De 1 000", "De 1 200", "De 1 500", "De 1800",
"De 2 000", "De 2 500", "De 3 000", "De 4 000",
"De 6 000", "10 000"
]), numcom_UU2010 = random.choice(["B", "C", "I", "R"])
]), numcom_UU2010 = ["B", "C", "I", "R"][household_index % 4]
))

for person_index in range(HTS_HOUSEHOLD_MEMBERS):
@@ -388,8 +388,9 @@ def create(output_path):
trips = []
)

person_index = 0
for household_index in range(HTS_HOUSEHOLDS):
household_id = household_index
household_id = household_index * 1000 + 50

municipality = random.choice(df["municipality"].unique())
region = df[df["municipality"] == municipality]["region"].values[0]
Expand All @@ -402,8 +403,7 @@ def create(output_path):
MNP = 3, REVENU = random.randint(12)
))

for person_index in range(HTS_HOUSEHOLD_MEMBERS):
person_id = household_id * 1000 + person_index
for person_id in range(1, HTS_HOUSEHOLD_MEMBERS + 1):
studies = random.random_sample() < 0.3

data["persons"].append(dict(
@@ -421,15 +421,15 @@ def create(output_path):
work_region = df[df["municipality"] == work_municipality]["region"].values[0]
work_department = df[df["municipality"] == work_municipality]["department"].values[0]

purpose = 21 if studies else 11
purpose = 4 if studies else 2
mode = random.choice([1, 2, 3, 5, 7])

origin_hour = 8
origin_minute = 0

if person_index % 100 == 0:
# Testing proper diffusion of plan times
orign_hour = 0
origin_hour = 0
origin_minute = 12

data["trips"].append(dict(
Expand All @@ -442,18 +442,27 @@ def create(output_path):

data["trips"].append(dict(
NQUEST = household_id, NP = person_id,
ND = 1, ORDEP = work_department, DESTDEP = home_department,
ND = 2, ORDEP = work_department, DESTDEP = home_department,
ORH = 8, ORM = 0, DESTH = 9, DESTM = 0, ORCOMM = work_municipality,
DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2,
DESTMOT_H9 = 31, ORMOT_H9 = purpose
DESTMOT_H9 = 5, ORMOT_H9 = purpose
))

data["trips"].append(dict(
NQUEST = household_id, NP = person_id,
ND = 2, ORDEP = home_department, DESTDEP = home_department,
ND = 3, ORDEP = home_department, DESTDEP = home_department,
ORH = 17, ORM = 0, DESTH = 18, DESTM = 0, ORCOMM = home_municipality,
DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2,
DESTMOT_H9 = 1, ORMOT_H9 = 31
DESTMOT_H9 = 1, ORMOT_H9 = 5
))

# Tail
data["trips"].append(dict(
NQUEST = household_id, NP = person_id,
ND = 4, ORDEP = home_department, DESTDEP = home_department,
ORH = 22, ORM = 0, DESTH = 21, DESTM = 0, ORCOMM = home_municipality,
DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2,
DESTMOT_H9 = 5, ORMOT_H9 = 1
))

os.mkdir("%s/egt_2010" % output_path)
@@ -657,7 +666,22 @@ def create(output_path):

df_sirene_geoloc.to_csv("%s/sirene/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip" % output_path, index = False, sep=";", compression={'method': 'zip', 'archive_name': 'GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.csv'})


# Data set: Urban type
print("Creating urban type ...")
df_urban_type = df_codes[["DEPCOM"]].copy().rename(columns = { "DEPCOM": "CODGEO" })
df_urban_type = df_urban_type.drop_duplicates()
df_urban_type["STATUT_2017"] = [["B", "C", "I", "H"][k % 4] for k in range(len(df_urban_type))]

df_urban_type = pd.concat([df_urban_type, pd.DataFrame({
"CODGEO": ["75056", "69123", "13055"],
"STATUT_2017": ["C", "C", "C"]
})])

os.mkdir("%s/urban_type" % output_path)
with zipfile.ZipFile("%s/urban_type/UU2020_au_01-01-2023.zip" % output_path, "w") as archive:
with archive.open("UU2020_au_01-01-2023.xlsx", "w") as f:
df_urban_type.to_excel(f, startrow = 5, sheet_name = "Composition_communale", index = False)

# Data set: OSM
# We add a road grid of 500m
print("Creating OSM ...")
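Several fixtures above switch from random.choice to modulo indexing (e.g. ["B", "C", "I", "H"][k % 4]). The cycled value is a pure function of the row index, independent of how many random draws happened before, which keeps the generated test data deterministic and helps keep the row counts asserted in test_pipeline.py stable. A minimal comparison:

import numpy as np

random = np.random.RandomState(0)
values = ["B", "C", "I", "H"]

# RNG draw: depends on the seed and on every draw made before this one
drawn = [random.choice(values) for _ in range(8)]

# Modulo cycling: reproducible from the index alone, immune to RNG call order
cycled = [values[k % len(values)] for k in range(8)]
# -> ["B", "C", "I", "H", "B", "C", "I", "H"]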
