Skip to content

Commit

Permalink
categorisation education : new distribution of education od with age range
Browse files Browse the repository at this point in the history
  • Loading branch information
Marie Laurent committed Jul 25, 2024
1 parent 569f840 commit 30709d3
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 20 deletions.
12 changes: 11 additions & 1 deletion data/od/cleaned.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,22 @@ def execute(context):

assert not np.any(df_work["commute_mode"].isna())

# Clean age range for education
df_education["age_range"] = np.nan
df_education.loc[df_education["AGEREV10"] <= 6, "age_range"] = "primary_school"
df_education.loc[df_education["AGEREV10"] == 11, "age_range"] = "middle_school"
df_education.loc[df_education["AGEREV10"] == 15, "age_range"] = "high_school"
df_education.loc[df_education["AGEREV10"] >= 18, "age_range"] = "higher_education"
df_education["age_range"] = df_education["age_range"].astype("category")

assert not np.any(df_education["age_range"].isna())

# Aggregate the flows
print("Aggregating work ...")
df_work = df_work.groupby(["origin_id", "destination_id", "commute_mode"])["weight"].sum().reset_index()

print("Aggregating education ...")
df_education = df_education.groupby(["origin_id", "destination_id"])["weight"].sum().reset_index()
df_education = df_education.groupby(["origin_id", "destination_id","age_range"])["weight"].sum().reset_index()

df_work["weight"] = df_work["weight"].fillna(0.0)
df_education["weight"] = df_education["weight"].fillna(0.0)
Expand Down
3 changes: 2 additions & 1 deletion data/od/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ def execute(context):
"COMMUNE":"str",
"ARM":"str",
"IPONDI":"float",
"DCETUF":"str"
"DCETUF":"str",
"AGEREV10":"int"
}

with zipfile.ZipFile(
Expand Down
32 changes: 20 additions & 12 deletions data/od/weighted.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,24 @@
def configure(context):
context.stage("data.od.cleaned")
context.stage("data.spatial.codes")
context.config("output_path")
context.config("education_location_source","bpe")

def fix_origins(df, commune_ids, purpose):
def fix_origins(df, commune_ids, purpose,category):
existing_ids = set(np.unique(df["origin_id"]))
missing_ids = commune_ids - existing_ids
categories = set(np.unique(df[category]))

rows = []
for origin_id in missing_ids:
for destination_id in commune_ids:
rows.append((origin_id, destination_id, 1.0 if origin_id == destination_id else 0.0))
for category_name in categories :
rows.append((origin_id, destination_id, category_name, 1.0/len(categories) if origin_id == destination_id else 0.0))

print("Fixing %d origins for %s" % (len(missing_ids), purpose))

return pd.concat([df, pd.DataFrame.from_records(
rows, columns = ["origin_id", "destination_id", "weight"]
rows, columns = ["origin_id", "destination_id", category, "weight"]
)]).sort_values(["origin_id", "destination_id"])

def execute(context):
Expand All @@ -35,25 +39,29 @@ def execute(context):
# Load data
df_work, df_education = context.stage("data.od.cleaned")

# Aggregate work (we do not consider different modes at the moment)
df_work = df_work[["origin_id", "destination_id", "weight"]].groupby(["origin_id", "destination_id"]).sum().reset_index()

# Add missing origins
df_work = fix_origins(df_work, commune_ids, "work")
df_education = fix_origins(df_education, commune_ids, "education")
df_work = fix_origins(df_work, commune_ids, "work","commute_mode")
df_education = fix_origins(df_education, commune_ids, "education","age_range")

# Aggregate work (we do not consider different modes at the moment)
df_work = df_work[["origin_id", "destination_id", "weight"]].groupby(["origin_id", "destination_id"]).sum().reset_index()

# Compute totals
df_total = df_work[["origin_id", "weight"]].groupby("origin_id").sum().reset_index().rename({ "weight" : "total" }, axis = 1)
df_work = pd.merge(df_work, df_total, on = "origin_id")

df_total = df_education[["origin_id", "weight"]].groupby("origin_id").sum().reset_index().rename({ "weight" : "total" }, axis = 1)
df_education = pd.merge(df_education, df_total, on = "origin_id")

df_total = df_education[["origin_id","age_range", "weight"]].groupby(["origin_id","age_range"]).sum().reset_index().rename({ "weight" : "total" }, axis = 1)
df_education = pd.merge(df_education, df_total, on = ["origin_id","age_range"])

if context.config("education_location_source") == 'bpe':
# Aggregate education (we do not consider different age range with bpe source)
df_education = df_education[["origin_id", "destination_id", "weight","total"]].groupby(["origin_id", "destination_id"]).sum().reset_index()
# Compute weight
df_work["weight"] /= df_work["total"]
df_education["weight"] /= df_education["total"]

del df_work["total"]
del df_education["total"]

df_education = df_education.fillna(0.0)

return df_work, df_education
7 changes: 3 additions & 4 deletions synthesis/locations/education.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,12 @@ def execute(context):
df_education["fake"] = False
df_education = df_education.to_crs("2154")
list_type = set(df_education["TYPEQU"].unique())
df_locations = pd.concat([df_locations[~(df_locations["TYPEQU"].isin(list_type))],df_education])
df_locations = pd.concat([df_locations[~(df_locations["TYPEQU"].str.startswith(tuple(list_type)))],df_education[df_education["commune_id"].isin(required_communes)]])


# Add education destinations in function of level education
for c in ["C1", "C2", "C3"]:
missing_communes = required_communes - set(
df_locations[df_locations["TYPEQU"].str.startswith(c)]["commune_id"].unique())
missing_communes = required_communes - set(df_locations[df_locations["TYPEQU"].str.startswith(c)]["commune_id"].unique())

if len(missing_communes) > 0:
df_locations = pd.concat([df_locations,fake_education(missing_communes, c, df_locations, df_zones)])
Expand All @@ -93,7 +92,7 @@ def execute(context):

if len(missing_communes) > 0:

df_locations = pd.concat([df_locations,fake_education(missing_communes, "C4", df_locations, df_zones)])
df_locations = pd.concat([df_locations,fake_education(missing_communes, "C4", df_locations, df_zones)])
else :
missing_communes = required_communes - set(df_locations["commune_id"].unique())
if len(missing_communes) > 0:
Expand Down
4 changes: 2 additions & 2 deletions synthesis/population/spatial/primary/candidates.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def configure(context):
context.stage("synthesis.population.spatial.home.zones")
context.stage("synthesis.population.enriched")
context.stage("synthesis.population.trips")

context.config("output_path")
context.config("random_seed")
context.config("education_location_source", "bpe")

Expand Down Expand Up @@ -149,7 +149,7 @@ def execute(context):
df_education.append(
process(context, "education_" + prefix, random,
df_persons[df_persons["age"].between( education_type["min_age"],education_type["max_age"])],
df_education_od,df_locations[df_locations["TYPEQU"].str.startswith(education_type["type_edu"])])
df_education_od[df_education_od["age_range"]==prefix],df_locations[df_locations["TYPEQU"].str.startswith(education_type["type_edu"])])
)
df_education = pd.concat(df_education).sort_values(["origin_id", "destination_id"])

Expand Down

0 comments on commit 30709d3

Please sign in to comment.