Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: use future population projections #210

Merged
merged 25 commits into from
Feb 14, 2024
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

**Under development**

- feat: functionality to make use of INSEE population projection data
- update: don't remove households with people not living/studying in Île-de-France anymore to be more consistent with other use cases
- fix bug where always one household_id existed twice
- Fix read order when exploring files using `glob`
- Modes are only written now to `trips.csv` if `mode_choice` is activated
- Update to `eqasim-java` commit `7cbe85b`
Expand Down
18 changes: 1 addition & 17 deletions data/census/cleaned.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def execute(context):

# Fill up undefined household ids (those where NUMMI == Z)
f = np.isnan(df["household_id"])
df.loc[f, "household_id"] = np.arange(np.count_nonzero(f)) + df["household_id"].max()
df.loc[f, "household_id"] = np.arange(np.count_nonzero(f)) + df["household_id"].max() + 1
df["household_id"] = df["household_id"].astype(int)

# Put person IDs
Expand All @@ -48,17 +48,6 @@ def execute(context):
df.loc[f_undefined, "iris_id"] = "undefined"
df["iris_id"] = df["iris_id"].astype("category")

# Verify with requested codes
df_codes = context.stage("data.spatial.codes")

excess_communes = set(df["commune_id"].unique()) - set(df_codes["commune_id"].unique())
if not excess_communes == {"undefined"}:
raise RuntimeError("Found additional communes: %s" % excess_communes)

excess_iris = set(df["iris_id"].unique()) - set(df_codes["iris_id"].unique())
if not excess_iris == {"undefined"}:
raise RuntimeError("Found additional IRIS: %s" % excess_iris)

# Age
df["age"] = df["AGED"].apply(lambda x: "0" if x == "000" else x.lstrip("0")).astype(int)

Expand Down Expand Up @@ -104,10 +93,6 @@ def execute(context):
# Socioprofessional category
df["socioprofessional_class"] = df["CS1"].astype(int)

# Place of work or education
df["work_outside_region"] = df["ILT"].isin(("4", "5", "6"))
df["education_outside_region"] = df["ILETUD"].isin(("4", "5", "6"))

# Consumption units
df = pd.merge(df, hts.calculate_consumption_units(df), on = "household_id")

Expand All @@ -117,6 +102,5 @@ def execute(context):
"age", "sex", "couple",
"commute_mode", "employed",
"studies", "number_of_vehicles", "household_size",
"work_outside_region", "education_outside_region",
"consumption_units", "socioprofessional_class"
]]
38 changes: 9 additions & 29 deletions data/census/filtered.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,38 +14,18 @@ def configure(context):
def execute(context):
df = context.stage("data.census.cleaned")

# We remove people who study or work in another region
f = df["work_outside_region"] | df["education_outside_region"]
remove_ids = df[f]["household_id"].unique()

initial_households = len(df["household_id"].unique())
removed_households = len(remove_ids)

initial_persons = len(df["person_id"].unique())
removed_persons = np.count_nonzero(df["household_id"].isin(remove_ids))

# TODO: This filtering is not really compatible with defining multiple regions
# or departments. This used to be a filter to avoid people going outside of
# Île-de-France, but we should consider removing this filter altogether, or
# find some smarter way (e.g. using OD matrices and filter out people in
# each municipality by the share of outside workers).
# Filter requested codes
df_codes = context.stage("data.spatial.codes")

if len(df_codes["region_id"].unique()) > 1:
raise RuntimeError("""
Multiple regions are defined, so the filtering for people going outside
of Île-de-France does not make sense in that case. Consider adjusting the
data.census.filtered stage!
""")
requested_departements = df_codes["departement_id"].unique()
df = df[df["departement_id"].isin(requested_departements)]

print(
"Removing %d/%d (%.2f%%) households (with %d/%d persons, %.2f%%) because at least one person is working outside of Île-de-France" % (
removed_households, initial_households, 100 * removed_households / initial_households,
removed_persons, initial_persons, 100 * removed_persons / initial_persons
))
excess_communes = set(df["commune_id"].unique()) - set(df_codes["commune_id"].unique())
if not excess_communes == {"undefined"}:
raise RuntimeError("Found additional communes: %s" % excess_communes)

context.set_info("filtered_households_share", removed_households / initial_households)
context.set_info("filtered_persons_share", removed_persons / initial_persons)
excess_iris = set(df["iris_id"].unique()) - set(df_codes["iris_id"].unique())
if not excess_iris == {"undefined"}:
raise RuntimeError("Found additional IRIS: %s" % excess_iris)

df = df[~df["household_id"].isin(remove_ids)]
return df
75 changes: 75 additions & 0 deletions data/census/projection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import pandas as pd
import os

"""
This stage loads and cleans projection data about the French population.
"""

def configure(context):
context.config("data_path")
context.config("projection_path", "projection_2021")
context.config("projection_scenario", "00_central")
context.config("projection_year", None)

def execute(context):
source_path = "{}/{}/{}.xlsx".format(
context.config("data_path"),
context.config("projection_path"),
context.config("projection_scenario"))

projection_year = int(context.config("projection_year"))

df_all = pd.read_excel(
source_path, sheet_name = "population", skiprows = 1).iloc[:107]

df_male = pd.read_excel(
source_path, sheet_name = "populationH", skiprows = 1).iloc[:107]

df_female = pd.read_excel(
source_path, sheet_name = "populationF", skiprows = 1).iloc[:107]

df_male["sex"] = "male"
df_female["sex"] = "female"

assert df_all["Âge au 1er janvier"].iloc[-1] == "Total"
assert df_male["Âge au 1er janvier"].iloc[-1] == "Total des hommes"
assert df_female["Âge au 1er janvier"].iloc[-1] == "Total des femmes"

df_sex = pd.concat([
df_male.iloc[-1:],
df_female.iloc[-1:]
]).drop(columns = ["Âge au 1er janvier"])[["sex", projection_year]]
df_sex.columns = ["sex", "projection"]

df_age = df_all[["Âge au 1er janvier", projection_year]].iloc[:-1]
df_age.columns = ["age", "projection"]

df_male = df_male[["Âge au 1er janvier", "sex", projection_year]].iloc[:-1]
df_female = df_female[["Âge au 1er janvier", "sex", projection_year]].iloc[:-1]

df_male.columns = ["age", "sex", "projection"]
df_female.columns = ["age","sex", "projection"]

df_cross = pd.concat([df_male, df_female])
df_cross["sex"] = df_cross["sex"].astype("category")

df_total = df_all.iloc[-1:].drop(columns = ["Âge au 1er janvier"])[[projection_year]]
df_total.columns = ["projection"]

return {
"total": df_total, "sex": df_sex, "age": df_age, "cross": df_cross
}

def validate(context):
if context.config("projection_year") is not None:
source_path = "{}/{}/{}.xlsx".format(
context.config("data_path"),
context.config("projection_path"),
context.config("projection_scenario"))

if not os.path.exists(source_path):
raise RuntimeError("Projection data is not available")

return os.path.getsize(source_path)

return 0
14 changes: 9 additions & 5 deletions data/census/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,16 @@ def configure(context):
context.config("census_path", "rp_2019/RP2019_INDCVI_csv.zip")
context.config("census_csv", "FD_INDCVI_2019.csv")

context.config("projection_year", None)

COLUMNS_DTYPES = {
"CANTVILLE":"str",
"NUMMI":"str",
"AGED":"str",
"COUPLE":"str",
"CS1":"str",
"DEPT":"str",
"ETUD":"str",
"ILETUD":"str",
"ILT":"str",
"ETUD":"str",
"IPONDI":"str",
"IRIS":"str",
"REGION":"str",
Expand All @@ -39,6 +39,9 @@ def execute(context):

requested_departements = df_codes["departement_id"].unique()

# only pre-filter if we don't need to reweight the census later
prefilter_departments = context.config("projection_year") is None

with context.progress(label = "Reading census ...") as progress:
with zipfile.ZipFile(
"{}/{}".format(context.config("data_path"), context.config("census_path"))) as archive:
Expand All @@ -50,8 +53,9 @@ def execute(context):

for df_chunk in csv:
progress.update(len(df_chunk))

df_chunk = df_chunk[df_chunk["DEPT"].isin(requested_departements)]

if prefilter_departments:
df_chunk = df_chunk[df_chunk["DEPT"].isin(requested_departements)]

if len(df_chunk) > 0:
df_records.append(df_chunk)
Expand Down
23 changes: 23 additions & 0 deletions docs/population.md
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,26 @@ config:
```

Running the pipeline again will add the `mode` colum to the `trips.csv` file and its spatial equivalent.

### Population projections

The pipeline allows to make use of population projections from INSEE up to 2021. The same methodology can also be used to scale down the population. The process takes into account the marginal distribution of sex, age, their combination, and the total number of persons. The census data for the base year (see above) is reweighted according to those marginals using *Iterative Proportional Updating*.

- To make use of the scaling, [download the projection data from INSEE](https://www.insee.fr/fr/statistiques/5894093?sommaire=5760764). There are various scenarios in Excel format that you can choose from. The default is the *Scénario centrale*, the central scenario.
- Put the downloaded file into `data/projection_2021`, so you will have the file `data/projection_2021/00_central.xlsx`

Then, activate the projection procedure by defining the projection year in the configuration:

```yaml
config:
# [...]
projection_year: 2030
```

You may choose any year (past or future) that is contained in the projection scenario Excel file. In case you want to use a different scenario, download the corresponding file, put it into the folder mentioned above, and choose the scenario name via configuration:

```yaml
config:
# [...]
projection_scenario: 00_central
```
131 changes: 131 additions & 0 deletions synthesis/population/projection/ipu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import pandas as pd
import numpy as np

"""
This stage reweights the census data set according to the projection data for a different year.
"""

def configure(context):
context.stage("data.census.cleaned")
context.stage("data.census.projection")

def execute(context):
df_census = context.stage("data.census.cleaned")
projection = context.stage("data.census.projection")

# Prepare indexing
df_households = df_census[["household_id", "household_size", "weight"]].drop_duplicates("household_id")
df_households["household_index"] = np.arange(len(df_households))
df_census = pd.merge(df_census, df_households[["household_id", "household_index"]])

# Obtain weights and sizes as arrays
household_weights = df_households["weight"].values
household_sizes = df_households["household_size"].values

# Obtain the attribute levels and membership of attributes for all households
attributes = []

attribute_membership = []
attribute_counts = []
attribute_targets = []

# Proccesing age ...
df_marginal = projection["age"]
for index, row in context.progress(df_marginal.iterrows(), label = "Processing attribute: age", total = len(df_marginal)):
f = df_census["age"] == row["age"]

if row["age"] == 0:
continue # we skip incompatible values for peopel of zero age

if np.count_nonzero(f) == 0:
print("Did not find age:", row["age"])

else:
df_counts = df_census.loc[f, "household_index"].value_counts()

attribute_targets.append(row["projection"])
attribute_membership.append(df_counts.index.values)
attribute_counts.append(df_counts.values)
attributes.append("age={}".format(row["age"]))

# Processing sex ...
df_marginal = projection["sex"]
for index, row in context.progress(df_marginal.iterrows(), label = "Processing attribute: sex", total = len(df_marginal)):
f = df_census["sex"] == row["sex"]

if np.count_nonzero(f) == 0:
print("Did not find sex:", row["sex"])

else:
df_counts = df_census.loc[f, "household_index"].value_counts()

attribute_targets.append(row["projection"])
attribute_membership.append(df_counts.index.values)
attribute_counts.append(df_counts.values)
attributes.append("sex={}".format(row["sex"]))

# Processing age x sex ...
df_marginal = projection["cross"]
for index, row in context.progress(df_marginal.iterrows(), label = "Processing attributes: sex x age", total = len(df_marginal)):
f = (df_census["sex"] == row["sex"]) & (df_census["age"] == row["age"])

if row["age"] == 0:
continue

if np.count_nonzero(f) == 0:
print("Did not find values:", row["sex"], row["age"])

else:
df_counts = df_census.loc[f, "household_index"].value_counts()

attribute_targets.append(row["projection"])
attribute_membership.append(df_counts.index.values)
attribute_counts.append(df_counts.values)
attributes.append("sex={},age={}".format(row["sex"], row["age"]))

# Processing total ...
attribute_targets.append(projection["total"]["projection"].values[0])
attribute_membership.append(np.arange(len(household_sizes)))
attribute_counts.append(household_sizes)
attributes.append("total")

# Perform IPU to obtain update weights
update = np.ones((len(df_households),))

minimum_factors = []
maximum_factors = []

for iteration in context.progress(range(100), label = "Performing IPU"):
factors = []
for k in np.arange(len(attributes)):
selection = attribute_membership[k]

target = attribute_targets[k]
current = np.sum(update[selection] * household_weights[selection] * attribute_counts[k])

factor = target / current
factors.append(factor)

update[selection] *= factor

minimum_factors.append(np.min(factors))
maximum_factors.append(np.max(factors))

if np.max(factors) - np.min(factors) < 1e-3:
break

criterion = np.max(factors) - np.min(factors)

# Check that the applied factors in the last iteration are sufficiently small
assert criterion > 0.01

# For a sanity check, we check for the obtained distribution in 2019, but this
# may evolve in the future.
assert np.quantile(update, 0.1) > 0.35
assert np.quantile(update, 0.8) < 2.0
assert np.quantile(update, 0.9) < 2.5

# Update the weights
df_households["weight"] *= update

return df_households[["household_id", "weight"]]
Loading
Loading