feat: use future population projections (#210)
sebhoerl authored Feb 14, 2024
1 parent e0d49e7 commit f5f3666
Showing 10 changed files with 289 additions and 62 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,9 @@

**Under development**

- feat: functionality to make use of INSEE population projection data
- update: don't remove households with people not living/studying in Île-de-France anymore, to be more consistent with other use cases
- fix: bug where one household_id always existed twice
- Fix read order when exploring files using `glob`
- Modes are only written now to `trips.csv` if `mode_choice` is activated
- Update to `eqasim-java` commit `7cbe85b`
18 changes: 1 addition & 17 deletions data/census/cleaned.py
@@ -26,7 +26,7 @@ def execute(context):

# Fill up undefined household ids (those where NUMMI == Z)
f = np.isnan(df["household_id"])
df.loc[f, "household_id"] = np.arange(np.count_nonzero(f)) + df["household_id"].max()
df.loc[f, "household_id"] = np.arange(np.count_nonzero(f)) + df["household_id"].max() + 1
df["household_id"] = df["household_id"].astype(int)

# Put person IDs
@@ -48,17 +48,6 @@ def execute(context):
df.loc[f_undefined, "iris_id"] = "undefined"
df["iris_id"] = df["iris_id"].astype("category")

# Verify with requested codes
df_codes = context.stage("data.spatial.codes")

excess_communes = set(df["commune_id"].unique()) - set(df_codes["commune_id"].unique())
if not excess_communes == {"undefined"}:
raise RuntimeError("Found additional communes: %s" % excess_communes)

excess_iris = set(df["iris_id"].unique()) - set(df_codes["iris_id"].unique())
if not excess_iris == {"undefined"}:
raise RuntimeError("Found additional IRIS: %s" % excess_iris)

# Age
df["age"] = df["AGED"].apply(lambda x: "0" if x == "000" else x.lstrip("0")).astype(int)

@@ -104,10 +93,6 @@ def execute(context):
# Socioprofessional category
df["socioprofessional_class"] = df["CS1"].astype(int)

# Place of work or education
df["work_outside_region"] = df["ILT"].isin(("4", "5", "6"))
df["education_outside_region"] = df["ILETUD"].isin(("4", "5", "6"))

# Consumption units
df = pd.merge(df, hts.calculate_consumption_units(df), on = "household_id")

@@ -117,6 +102,5 @@ def execute(context):
"age", "sex", "couple",
"commute_mode", "employed",
"studies", "number_of_vehicles", "household_size",
"work_outside_region", "education_outside_region",
"consumption_units", "socioprofessional_class"
]]
38 changes: 9 additions & 29 deletions data/census/filtered.py
@@ -14,38 +14,18 @@ def configure(context):
def execute(context):
df = context.stage("data.census.cleaned")

# We remove people who study or work in another region
f = df["work_outside_region"] | df["education_outside_region"]
remove_ids = df[f]["household_id"].unique()

initial_households = len(df["household_id"].unique())
removed_households = len(remove_ids)

initial_persons = len(df["person_id"].unique())
removed_persons = np.count_nonzero(df["household_id"].isin(remove_ids))

# TODO: This filtering is not really compatible with defining multiple regions
# or departments. This used to be a filter to avoid people going outside of
# Île-de-France, but we should consider removing this filter altogether, or
# find some smarter way (e.g. using OD matrices and filter out people in
# each municipality by the share of outside workers).
# Filter requested codes
df_codes = context.stage("data.spatial.codes")

if len(df_codes["region_id"].unique()) > 1:
raise RuntimeError("""
Multiple regions are defined, so the filtering for people going outside
of Île-de-France does not make sense in that case. Consider adjusting the
data.census.filtered stage!
""")
requested_departements = df_codes["departement_id"].unique()
df = df[df["departement_id"].isin(requested_departements)]

print(
"Removing %d/%d (%.2f%%) households (with %d/%d persons, %.2f%%) because at least one person is working outside of Île-de-France" % (
removed_households, initial_households, 100 * removed_households / initial_households,
removed_persons, initial_persons, 100 * removed_persons / initial_persons
))
excess_communes = set(df["commune_id"].unique()) - set(df_codes["commune_id"].unique())
if not excess_communes == {"undefined"}:
raise RuntimeError("Found additional communes: %s" % excess_communes)

context.set_info("filtered_households_share", removed_households / initial_households)
context.set_info("filtered_persons_share", removed_persons / initial_persons)
excess_iris = set(df["iris_id"].unique()) - set(df_codes["iris_id"].unique())
if not excess_iris == {"undefined"}:
raise RuntimeError("Found additional IRIS: %s" % excess_iris)

df = df[~df["household_id"].isin(remove_ids)]
return df
75 changes: 75 additions & 0 deletions data/census/projection.py
@@ -0,0 +1,75 @@
import pandas as pd
import os

"""
This stage loads and cleans projection data about the French population.
"""

def configure(context):
context.config("data_path")
context.config("projection_path", "projection_2021")
context.config("projection_scenario", "00_central")
context.config("projection_year", None)

def execute(context):
source_path = "{}/{}/{}.xlsx".format(
context.config("data_path"),
context.config("projection_path"),
context.config("projection_scenario"))

projection_year = int(context.config("projection_year"))

df_all = pd.read_excel(
source_path, sheet_name = "population", skiprows = 1).iloc[:107]

df_male = pd.read_excel(
source_path, sheet_name = "populationH", skiprows = 1).iloc[:107]

df_female = pd.read_excel(
source_path, sheet_name = "populationF", skiprows = 1).iloc[:107]

df_male["sex"] = "male"
df_female["sex"] = "female"

assert df_all["Âge au 1er janvier"].iloc[-1] == "Total"
assert df_male["Âge au 1er janvier"].iloc[-1] == "Total des hommes"
assert df_female["Âge au 1er janvier"].iloc[-1] == "Total des femmes"

df_sex = pd.concat([
df_male.iloc[-1:],
df_female.iloc[-1:]
]).drop(columns = ["Âge au 1er janvier"])[["sex", projection_year]]
df_sex.columns = ["sex", "projection"]

df_age = df_all[["Âge au 1er janvier", projection_year]].iloc[:-1]
df_age.columns = ["age", "projection"]

df_male = df_male[["Âge au 1er janvier", "sex", projection_year]].iloc[:-1]
df_female = df_female[["Âge au 1er janvier", "sex", projection_year]].iloc[:-1]

df_male.columns = ["age", "sex", "projection"]
    df_female.columns = ["age", "sex", "projection"]

df_cross = pd.concat([df_male, df_female])
df_cross["sex"] = df_cross["sex"].astype("category")

df_total = df_all.iloc[-1:].drop(columns = ["Âge au 1er janvier"])[[projection_year]]
df_total.columns = ["projection"]

return {
"total": df_total, "sex": df_sex, "age": df_age, "cross": df_cross
}

def validate(context):
if context.config("projection_year") is not None:
source_path = "{}/{}/{}.xlsx".format(
context.config("data_path"),
context.config("projection_path"),
context.config("projection_scenario"))

if not os.path.exists(source_path):
raise RuntimeError("Projection data is not available")

return os.path.getsize(source_path)

return 0
14 changes: 9 additions & 5 deletions data/census/raw.py
@@ -13,16 +13,16 @@ def configure(context):
context.config("census_path", "rp_2019/RP2019_INDCVI_csv.zip")
context.config("census_csv", "FD_INDCVI_2019.csv")

context.config("projection_year", None)

COLUMNS_DTYPES = {
"CANTVILLE":"str",
"NUMMI":"str",
"AGED":"str",
"COUPLE":"str",
"CS1":"str",
"DEPT":"str",
"ETUD":"str",
"ILETUD":"str",
"ILT":"str",
"ETUD":"str",
"IPONDI":"str",
"IRIS":"str",
"REGION":"str",
@@ -39,6 +39,9 @@ def execute(context):

requested_departements = df_codes["departement_id"].unique()

# only pre-filter if we don't need to reweight the census later
prefilter_departments = context.config("projection_year") is None

with context.progress(label = "Reading census ...") as progress:
with zipfile.ZipFile(
"{}/{}".format(context.config("data_path"), context.config("census_path"))) as archive:
@@ -50,8 +53,9 @@

for df_chunk in csv:
progress.update(len(df_chunk))

df_chunk = df_chunk[df_chunk["DEPT"].isin(requested_departements)]

if prefilter_departments:
df_chunk = df_chunk[df_chunk["DEPT"].isin(requested_departements)]

if len(df_chunk) > 0:
df_records.append(df_chunk)
23 changes: 23 additions & 0 deletions docs/population.md
@@ -290,3 +290,26 @@ config:
```
Running the pipeline again will add the `mode` column to the `trips.csv` file and its spatial equivalent.

### Population projections

The pipeline can make use of INSEE population projections (2021 edition). The same methodology can also be used to scale down the population. The process takes into account the marginal distributions of sex, age, their combination, and the total number of persons. The census data for the base year (see above) is reweighted according to these marginals using *Iterative Proportional Updating* (IPU).
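
The reweighting principle can be illustrated with a minimal sketch (illustrative only: the marginal values below are made up, and the actual implementation ships in `synthesis/population/projection/ipu.py`):

```python
import numpy as np

# Toy setup: 4 households and 2 marginal constraints (e.g. one age class, one sex).
weights = np.ones(4)                                # initial household weights
membership = [np.array([0, 1]), np.array([2, 3])]   # households contributing to each marginal
counts = [np.array([2, 1]), np.array([1, 3])]       # matching persons per contributing household
targets = [30.0, 50.0]                              # projected person counts per marginal

for _ in range(100):
    factors = []

    for selection, count, target in zip(membership, counts, targets):
        current = np.sum(weights[selection] * count)  # persons currently represented for this marginal
        factor = target / current
        weights[selection] *= factor                  # scale the contributing households
        factors.append(factor)

    if max(factors) - min(factors) < 1e-3:            # converged: all constraints met simultaneously
        break

print(weights)  # updated household weights that reproduce the projected marginals
```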

- To make use of the scaling, [download the projection data from INSEE](https://www.insee.fr/fr/statistiques/5894093?sommaire=5760764). Various scenarios are available in Excel format; the default is the *Scénario central* (central scenario).
- Put the downloaded file into `data/projection_2021`, so that you end up with the file `data/projection_2021/00_central.xlsx`.

Then, activate the projection procedure by defining the projection year in the configuration:

```yaml
config:
# [...]
projection_year: 2030
```

You may choose any year (past or future) that is contained in the projection scenario Excel file. If you want to use a different scenario, download the corresponding file, put it into the folder mentioned above, and choose the scenario name via configuration:

```yaml
config:
# [...]
projection_scenario: 00_central
```
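
Putting these options together, a configuration that activates the projection with the default file layout and scenario could look like this (the `data_path` value is only a placeholder):

```yaml
config:
  data_path: /path/to/data           # folder containing projection_2021/
  projection_path: projection_2021   # default folder name for the projection files
  projection_scenario: 00_central    # default INSEE scenario
  projection_year: 2030
```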
131 changes: 131 additions & 0 deletions synthesis/population/projection/ipu.py
@@ -0,0 +1,131 @@
import pandas as pd
import numpy as np

"""
This stage reweights the census data set according to the projection data for a different year.
"""

def configure(context):
context.stage("data.census.cleaned")
context.stage("data.census.projection")

def execute(context):
df_census = context.stage("data.census.cleaned")
projection = context.stage("data.census.projection")

# Prepare indexing
df_households = df_census[["household_id", "household_size", "weight"]].drop_duplicates("household_id")
df_households["household_index"] = np.arange(len(df_households))
df_census = pd.merge(df_census, df_households[["household_id", "household_index"]])

# Obtain weights and sizes as arrays
household_weights = df_households["weight"].values
household_sizes = df_households["household_size"].values

# Obtain the attribute levels and membership of attributes for all households
attributes = []

attribute_membership = []
attribute_counts = []
attribute_targets = []

    # Processing age ...
df_marginal = projection["age"]
for index, row in context.progress(df_marginal.iterrows(), label = "Processing attribute: age", total = len(df_marginal)):
f = df_census["age"] == row["age"]

if row["age"] == 0:
            continue # we skip incompatible values for people of zero age

if np.count_nonzero(f) == 0:
print("Did not find age:", row["age"])

else:
df_counts = df_census.loc[f, "household_index"].value_counts()

attribute_targets.append(row["projection"])
attribute_membership.append(df_counts.index.values)
attribute_counts.append(df_counts.values)
attributes.append("age={}".format(row["age"]))

# Processing sex ...
df_marginal = projection["sex"]
for index, row in context.progress(df_marginal.iterrows(), label = "Processing attribute: sex", total = len(df_marginal)):
f = df_census["sex"] == row["sex"]

if np.count_nonzero(f) == 0:
print("Did not find sex:", row["sex"])

else:
df_counts = df_census.loc[f, "household_index"].value_counts()

attribute_targets.append(row["projection"])
attribute_membership.append(df_counts.index.values)
attribute_counts.append(df_counts.values)
attributes.append("sex={}".format(row["sex"]))

# Processing age x sex ...
df_marginal = projection["cross"]
for index, row in context.progress(df_marginal.iterrows(), label = "Processing attributes: sex x age", total = len(df_marginal)):
f = (df_census["sex"] == row["sex"]) & (df_census["age"] == row["age"])

if row["age"] == 0:
continue

if np.count_nonzero(f) == 0:
print("Did not find values:", row["sex"], row["age"])

else:
df_counts = df_census.loc[f, "household_index"].value_counts()

attribute_targets.append(row["projection"])
attribute_membership.append(df_counts.index.values)
attribute_counts.append(df_counts.values)
attributes.append("sex={},age={}".format(row["sex"], row["age"]))

# Processing total ...
attribute_targets.append(projection["total"]["projection"].values[0])
attribute_membership.append(np.arange(len(household_sizes)))
attribute_counts.append(household_sizes)
attributes.append("total")

# Perform IPU to obtain update weights
update = np.ones((len(df_households),))

minimum_factors = []
maximum_factors = []

for iteration in context.progress(range(100), label = "Performing IPU"):
factors = []
for k in np.arange(len(attributes)):
selection = attribute_membership[k]

target = attribute_targets[k]
current = np.sum(update[selection] * household_weights[selection] * attribute_counts[k])

factor = target / current
factors.append(factor)

update[selection] *= factor

minimum_factors.append(np.min(factors))
maximum_factors.append(np.max(factors))

if np.max(factors) - np.min(factors) < 1e-3:
break

criterion = np.max(factors) - np.min(factors)

    # Check that the spread of the factors applied in the last iteration is sufficiently small
    assert criterion < 0.01

    # As a sanity check, we verify the distribution of the obtained update factors
    # against the 2019 base year; these bounds may evolve in the future.
assert np.quantile(update, 0.1) > 0.35
assert np.quantile(update, 0.8) < 2.0
assert np.quantile(update, 0.9) < 2.5

# Update the weights
df_households["weight"] *= update

return df_households[["household_id", "weight"]]