
Commit

fix: separate analysis from data output & update docs
Marie Laurent committed Oct 23, 2024
1 parent 9dd5df2 commit a01f600
Showing 3 changed files with 41 additions and 22 deletions.
35 changes: 31 additions & 4 deletions analysis/synthesis/population.py
@@ -2,8 +2,9 @@
import os
import numpy as np
import pandas as pd
import geopandas as gpd
from analysis.marginals import NUMBER_OF_VEHICLES_LABELS

from shapely import distance
AGE_CLASS = [0, 10, 14, 17, 25, 50, 65, np.inf]
NUMBER_OF_VEHICLES = [0, 1, 2, 3, np.inf]
NAME_AGE_CLASS = ["0-10","11-14","15-17","18-25","26-50","51-65","65+"]
@@ -13,8 +14,10 @@ def configure(context):
context.config("output_path")
context.config("output_prefix", "ile_de_france_")
context.config("sampling_rate")

context.stage("synthesis.population.trips")
context.stage("synthesis.population.enriched")
context.stage("synthesis.population.spatial.locations")

context.stage("data.census.filtered", alias = "census")
context.stage("data.hts.selected", alias = "hts")
@@ -30,7 +33,12 @@ def execute(context):
sampling_rate = context.config("sampling_rate")
df_person_eq = context.stage("synthesis.population.enriched")
df_trip_eq = context.stage("synthesis.population.trips")

df_location_eq = context.stage("synthesis.population.spatial.locations")[["person_id", "activity_index", "geometry"]]

df_location_eq = df_location_eq.to_crs("EPSG:2154")
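# A trip with trip_index i is assumed to link activity i (preceding) to activity i + 1 (following)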
df_trip_eq["preceding_activity_index"] = df_trip_eq["trip_index"]
df_trip_eq["following_activity_index"] = df_trip_eq["trip_index"] + 1

df_census = context.stage("census")
df_hts_households, df_hts_person, df_hts_trip = context.stage("hts")
df_hts_person["person_weight"] *=df_census["weight"].sum()/df_hts_person["person_weight"].sum()
@@ -47,6 +55,8 @@

df_eq_travel = pd.merge(df_trip_eq,df_person_eq[["person_id","age_class"]],on=["person_id"])
df_hts_travel = pd.merge(df_hts_trip,df_hts_person[["person_id","age_class","person_weight"]],on=["person_id"])

print("Generate tables ...")
# Age purpose analysis
analysis_age_purpose = pd.pivot_table(df_eq_travel,"person_id",index="age_class",columns="following_purpose",aggfunc="count")
analysis_age_purpose = analysis_age_purpose/sampling_rate
@@ -86,9 +96,26 @@ def execute(context):
# Compare distance
df_hts_travel["routed_distance"] = df_hts_travel["routed_distance"]/1000 if "routed_distance" in df_hts_travel.columns else df_hts_travel["euclidean_distance"]/1000
df_hts_travel["distance_class"] = pd.cut(df_hts_travel["routed_distance"],list(np.arange(100))+[np.inf])
analysis_dist = df_hts_travel.groupby("distance_class")["person_weight"].sum()

return analysis_dist
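# Attach the geometry of each trip's preceding and following activity to measure crow-fly distances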
df_spatial = pd.merge(df_trip_eq, df_location_eq.rename(columns = {
"activity_index": "preceding_activity_index",
"geometry": "preceding_geometry"
}), how = "left", on = ["person_id", "preceding_activity_index"])

df_spatial = pd.merge(df_spatial, df_location_eq.rename(columns = {
"activity_index": "following_activity_index",
"geometry": "following_geometry"
}), how = "left", on = ["person_id", "following_activity_index"])
df_spatial["distance"] = df_spatial.apply(lambda x:distance( x["preceding_geometry"],x["following_geometry"])/1000,axis=1)
df_spatial["distance_class"] = pd.cut(df_spatial["distance"],list(np.arange(100))+[np.inf])

analysis_distance = pd.concat([df_hts_travel.groupby("distance_class")["person_weight"].sum(),df_spatial.groupby("distance_class")["person_id"].count()],axis=1).reset_index()
analysis_distance.columns = ["Distance class","HTS","EQASIM"]
analysis_distance["Proportion_HTS"] = analysis_distance["HTS"] / analysis_distance["HTS"].sum()
analysis_distance["Proportion_EQASIM"] = analysis_distance["EQASIM"] / len(df_spatial)
analysis_distance["EQASIM"] = analysis_distance["EQASIM"]/ sampling_rate
analysis_distance.to_csv(f"{analysis_output_path}/{prefix}distance.csv")




8 changes: 8 additions & 0 deletions docs/population.md
@@ -450,3 +450,11 @@ folder as: `{output_prefix}_{age group}_{trip purpose}.html`

Note:
With `analysis_from_file` set to False, the most recently generated synthetic population is analysed by default. If `output_prefix` and `comparison_file_prefix` refer to the same outputs, or `comparison_file_prefix` is not specified, only a volume visualisation of that population is produced.
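For reference, a minimal sketch of how these options could look in `config.yml` (option names as used above; the values and the comparison prefix are illustrative placeholders):

```yaml
config:
  analysis_from_file: false              # analyse the last generated synthetic population
  output_prefix: ile_de_france_
  comparison_file_prefix: previous_run_  # omit to produce only the volume visualisation
```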


### Comparing the population to source data

Using the population pipeline in the `analysis` directory, you can generate several tables that compare the composition of the synthetic population to its source data. The tables currently generated compare: population volume by age range, household volume by number of vehicles, population volume with and without a driving licence, trip volume by age range, and trip volume by length.
From the synthetic population alone, a table of population volume by age range and trip purpose is also created.

To use this pipeline, you must already have created a synthetic population. Then open `config.yml` and add the `analysis.synthesis.population` stage to the `run` section, as in the sketch below.
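A minimal sketch of the corresponding `config.yml` excerpt — the stage name comes from this commit, while the other entries (including `synthesis.output` and the sample values) are illustrative placeholders to adapt to your setup:

```yaml
run:
  - synthesis.output
  - analysis.synthesis.population  # generates the comparison tables

config:
  output_path: output
  output_prefix: ile_de_france_
  sampling_rate: 0.001
```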
20 changes: 2 additions & 18 deletions synthesis/output.py
@@ -18,7 +18,7 @@ def configure(context):
context.stage("synthesis.vehicles.vehicles")

context.stage("synthesis.population.spatial.locations")
context.stage("analysis.synthesis.population")

context.stage("documentation.meta_output")

context.config("output_path")
@@ -273,20 +273,4 @@ def execute(context):
clean_gpkg(path)
if "geoparquet" in output_formats:
path = "%s/%strips.geoparquet" % (output_path, output_prefix)
df_spatial.to_parquet(path)

# Output population analysis
SAMPLING_RATE =context.config("sampling_rate")
df_spatial = df_spatial.to_crs("EPSG:2154")

df_spatial["distance"] = df_spatial.length/1000
df_spatial["distance_class"] = pd.cut(df_spatial["distance"],list(np.arange(100))+[np.inf])

# Compare distance
analysis_distance = context.stage("analysis.synthesis.population")
analysis_distance = pd.concat([analysis_distance,df_spatial.groupby("distance_class")["person_id"].count()],axis=1).reset_index()
analysis_distance.columns = ["Distance class","HTS","EQASIM"]
analysis_distance["Proportion_HTS"] = analysis_distance["HTS"] / analysis_distance["HTS"].sum()
analysis_distance["Proportion_EQASIM"] = analysis_distance["EQASIM"] / len(df_spatial)
analysis_distance["EQASIM"] = analysis_distance["EQASIM"]/ SAMPLING_RATE
analysis_distance.to_csv(f"{output_path}/{ANALYSIS_FOLDER}/{output_prefix}distance.csv")
df_spatial.to_parquet(path)
