testing and egt
sebhoerl committed Mar 18, 2024
1 parent d21ccc7 commit 4d4e5bf
Showing 7 changed files with 134 additions and 41 deletions.
21 changes: 21 additions & 0 deletions data/hts/egt/cleaned.py
@@ -10,6 +10,9 @@
def configure(context):
context.stage("data.hts.egt.raw")

if context.config("use_urban_type", False):
context.stage("data.spatial.urban_type")

INCOME_CLASS_BOUNDS = [800, 1200, 1600, 2000, 2400, 3000, 3500, 4500, 5500, 1e6]

PURPOSE_MAP = {
@@ -111,6 +114,24 @@ def execute(context):
df_households.loc[df_households["income_class"].isin([10.0, 11.0, np.nan]), "income_class"] = -1
df_households["income_class"] = df_households["income_class"].astype(int)

# Impute urban type
if context.config("use_urban_type"):
df_urban_type = context.stage("data.spatial.urban_type")[[
"commune_id", "urban_type"
]]

# Household municipality
df_households["commune_id"] = df_households["RESCOMM"].astype("category")
df_persons = pd.merge(df_persons, df_households[["household_id", "commune_id"]], how = "left")
assert np.all(~df_persons["commune_id"].isna())

# Impute urban type
df_persons = pd.merge(df_persons, df_urban_type, on = "commune_id", how = "left")
df_persons["urban_type"] = df_persons["urban_type"].fillna("none").astype("category")

df_households = df_households.drop(columns = ["commune_id"]) # drop is not in-place; the result must be assigned
df_persons = df_persons.drop(columns = ["commune_id"])

# Trip purpose
df_trips["following_purpose"] = "other"
df_trips["preceding_purpose"] = "other"
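For reference, a minimal sketch of the imputation pattern introduced above, using hypothetical miniature frames (the real stage pulls its urban types from data.spatial.urban_type and works on the full EGT tables):

import numpy as np
import pandas as pd

# Hypothetical stand-ins for the stage outputs
df_urban_type = pd.DataFrame({
    "commune_id": ["75101", "77111"],
    "urban_type": ["central_city", "suburb"]
})

df_households = pd.DataFrame({
    "household_id": [1, 2],
    "RESCOMM": ["75101", "91471"]  # second commune is absent from the urban type table
})

df_persons = pd.DataFrame({ "person_id": [10, 11], "household_id": [1, 2] })

# The commune comes from the household, then a left merge attaches the urban type
df_households["commune_id"] = df_households["RESCOMM"]
df_persons = pd.merge(df_persons, df_households[["household_id", "commune_id"]], how = "left")
assert np.all(~df_persons["commune_id"].isna())

df_persons = pd.merge(df_persons, df_urban_type, on = "commune_id", how = "left")
df_persons["urban_type"] = df_persons["urban_type"].fillna("none").astype("category")
# person 11 falls back to urban_type "none"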
13 changes: 9 additions & 4 deletions data/hts/egt/filtered.py
@@ -12,7 +12,6 @@ def configure(context):

def execute(context):
df_codes = context.stage("data.spatial.codes")
assert (df_codes["region_id"] == 11).all() # Otherwise EGT doesn't make sense

df_households, df_persons, df_trips = context.stage("data.hts.egt.cleaned")

@@ -39,9 +38,15 @@
df_households = df_households[df_households["household_id"].isin(df_persons["household_id"])]

# Finish up
df_households = df_households[hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"]]
df_persons = df_persons[hts.PERSON_COLUMNS + ["egt_household_id", "egt_person_id"]]
df_trips = df_trips[hts.TRIP_COLUMNS + ["euclidean_distance"] + ["egt_household_id", "egt_person_id", "egt_trip_id"]]
household_columns = hts.HOUSEHOLD_COLUMNS + ["income_class"] + ["egt_household_id"]
df_households = df_households[household_columns]

person_columns = hts.PERSON_COLUMNS + ["egt_household_id", "egt_person_id"]
if "urban_type" in df_persons: person_columns.append("urban_type")
df_persons = df_persons[person_columns]

trip_columns = hts.TRIP_COLUMNS + ["euclidean_distance"] + ["egt_household_id", "egt_person_id", "egt_trip_id"]
df_trips = df_trips[trip_columns]

hts.check(df_households, df_persons, df_trips)

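A note on the guard above: membership tests on a pandas DataFrame check column labels, so urban_type is only selected when the cleaned stage actually produced it. A tiny sketch with a hypothetical frame:

import pandas as pd

df_persons = pd.DataFrame({ "person_id": [1], "urban_type": ["suburb"] })

person_columns = ["person_id"]
if "urban_type" in df_persons:  # membership on a DataFrame tests column names
    person_columns.append("urban_type")

df_persons = df_persons[person_columns]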
28 changes: 17 additions & 11 deletions data/spatial/urban_type.py
@@ -23,7 +23,7 @@ def configure(context):
context.stage("data.spatial.municipalities")

context.config("data_path")
context.config("urban_type_path", "uu/UU2020_au_01-01-2023.zip")
context.config("urban_type_path", "urban_type/UU2020_au_01-01-2023.zip")

def execute(context):
with zipfile.ZipFile("{}/{}".format(
@@ -33,7 +33,7 @@ def execute(context):
df = pd.read_excel(f, sheet_name = "Composition_communale", skiprows = 5)

df = df[["CODGEO", "STATUT_2017"]].copy()
df = df.set_axis(["commune_id", "type_uu"], axis = "columns")
df = df.set_axis(["commune_id", "urban_type"], axis = "columns")

# Cities that have districts are not detailed in the UU file; only the whole city is mentioned
# However, the municipalities file lists the districts with their respective INSEE codes
@@ -43,21 +43,27 @@

# Replace each line of the UU file corresponding to a city with districts by multiple lines, one per district
for city_code in cities_with_districts:
uu_type = df[df["commune_id"] == city_code].iloc[0].loc["type_uu"]
df.drop(df[df["commune_id"] == city_code].index, inplace=True)
new_lines = {"commune_id": [district_id for district_id in cities_with_districts[city_code]],
"type_uu": [uu_type for i in range(len(cities_with_districts[city_code]))]}
df = pd.concat([df, pd.DataFrame.from_dict(new_lines)])
base_type = df[df["commune_id"] == city_code].iloc[0]["urban_type"]
replacement_codes = cities_with_districts[city_code]

df = pd.concat([df, pd.DataFrame({
"commune_id": replacement_codes,
"urban_type": [base_type] * len(replacement_codes)
})])

df = df[~df["commune_id"].isin(cities_with_districts.keys())]

# Clean unités urbaines
df["type_uu"] = df["type_uu"].replace({"B":"suburb","C":"central_city","I":"isolated_city","H":"none"})
assert np.all(~df["type_uu"].isna())
df["type_uu"] = df["type_uu"].astype("category")
df["urban_type"] = df["urban_type"].replace({"B":"suburb","C":"central_city","I":"isolated_city","H":"none"})
assert np.all(~df["urban_type"].isna())
df["urban_type"] = df["urban_type"].astype("category")

df_municipalities = context.stage("data.spatial.municipalities")
requested_communes = set(df_municipalities["commune_id"].unique())
df = df[df["commune_id"].isin(requested_communes)]


assert len(df["commune_id"].unique()) == len(df)

return df

def validate(context):
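A condensed sketch of the district handling above, on a hypothetical two-row frame; "75056" is Paris as a whole, and the three arrondissement codes stand in for the full list:

import pandas as pd

df = pd.DataFrame({ "commune_id": ["75056", "77111"], "urban_type": ["C", "B"] })
cities_with_districts = { "75056": ["75101", "75102", "75103"] }  # truncated for illustration

for city_code, district_codes in cities_with_districts.items():
    # Each district inherits the urban type of its parent city
    base_type = df[df["commune_id"] == city_code].iloc[0]["urban_type"]
    df = pd.concat([df, pd.DataFrame({
        "commune_id": district_codes,
        "urban_type": [base_type] * len(district_codes)
    })])

# Drop the parent cities once their districts carry the type
df = df[~df["commune_id"].isin(cities_with_districts.keys())]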
18 changes: 12 additions & 6 deletions synthesis/population/matched.py
@@ -20,7 +20,8 @@
}

DEFAULT_MATCHING_ATTRIBUTES = [
"sex", "any_cars", "age_class", "socioprofessional_class"
"sex", "any_cars", "age_class", "socioprofessional_class",
"departement_id"
]

def configure(context):
@@ -117,6 +118,9 @@ def statistical_matching(progress, df_source, source_identifier, weight, df_targ

progress.update(np.count_nonzero(unassigned_mask))

if np.count_nonzero(unassigned_mask) > 0:
raise RuntimeError("Some target observations could not be matched. Minimum observations configured too high?")

assert np.count_nonzero(unassigned_mask) == 0
assert np.count_nonzero(assigned_indices == -1) == 0

@@ -174,8 +178,7 @@ def execute(context):

try:
default_index = columns.index("*default*")
del columns[default_index]
columns.insert(default_index, DEFAULT_MATCHING_ATTRIBUTES)
columns[default_index:default_index + 1] = DEFAULT_MATCHING_ATTRIBUTES
except ValueError: pass

# Define matching attributes
@@ -199,9 +202,12 @@ def execute(context):
df_source = df_source.rename(columns = { "person_id": "hts_id" })

for column in columns:
assert column in df_source
assert column in df_target

if not column in df_source:
raise RuntimeError("Attribute not available in source (HTS) for matching: {}".format(column))

if not column in df_target:
raise RuntimeError("Attribute not available in target (census) for matching: {}".format(column))

df_assignment, levels = parallel_statistical_matching(
context,
df_source, "hts_id", "person_weight",
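The slice assignment replacing the earlier del/insert pair fixes a subtle bug: list.insert adds the whole list as one nested element, while a slice splice expands it in place. A minimal illustration:

DEFAULT_MATCHING_ATTRIBUTES = [
    "sex", "any_cars", "age_class", "socioprofessional_class",
    "departement_id"
]

columns = ["income_class", "*default*", "urban_type"]

default_index = columns.index("*default*")
columns[default_index:default_index + 1] = DEFAULT_MATCHING_ATTRIBUTES
# -> ["income_class", "sex", "any_cars", "age_class",
#     "socioprofessional_class", "departement_id", "urban_type"]

# The previous del/insert pair nested the list instead:
# del columns[default_index]
# columns.insert(default_index, DEFAULT_MATCHING_ATTRIBUTES)
# -> ["income_class", ["sex", ...], "urban_type"]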
12 changes: 10 additions & 2 deletions tests/test_determinism.py
@@ -54,7 +54,11 @@ def _test_determinism(index, data_path, tmpdir):
regions = [10, 11], sampling_rate = 1.0, hts = "entd",
random_seed = 1000, processes = 1,
secloc_maximum_iterations = 10,
maven_skip_tests = True
maven_skip_tests = True,
matching_attributes = [
"sex", "any_cars", "age_class", "socioprofessional_class",
"income_class", "departement_id"
]
)

stages = [
@@ -111,7 +115,11 @@ def _test_determinism_matsim(index, data_path, tmpdir):
regions = [10, 11], sampling_rate = 1.0, hts = "entd",
random_seed = 1000, processes = 1,
secloc_maximum_iterations = 10,
maven_skip_tests = True
maven_skip_tests = True,
matching_attributes = [
"sex", "any_cars", "age_class", "socioprofessional_class",
"income_class", "departement_id"
]
)

stages = [
37 changes: 30 additions & 7 deletions tests/test_pipeline.py
@@ -2,6 +2,7 @@
import os
import hashlib
from . import testdata
import pandas as pd

def test_data(tmpdir):
data_path = str(tmpdir.mkdir("data"))
@@ -34,7 +35,7 @@ def test_data(tmpdir):
assert os.path.isfile("%s/ile_de_france_hts_trips.csv" % output_path)
assert os.path.isfile("%s/ile_de_france_sirene.gpkg" % output_path)

def run_population(tmpdir, hts, mode_choice):
def run_population(tmpdir, hts, update = {}):
data_path = str(tmpdir.mkdir("data"))
testdata.create(data_path)

@@ -45,9 +46,9 @@ def run_population(tmpdir, hts, mode_choice):
regions = [10, 11], sampling_rate = 1.0, hts = hts,
random_seed = 1000, processes = 1,
secloc_maximum_iterations = 10,
maven_skip_tests = True,
mode_choice = mode_choice
maven_skip_tests = True
)
config.update(update)

stages = [
dict(descriptor = "synthesis.output"),
@@ -62,11 +63,33 @@ def run_population(tmpdir, hts, mode_choice):
assert os.path.isfile("%s/ile_de_france_trips.gpkg" % output_path)
assert os.path.isfile("%s/ile_de_france_meta.json" % output_path)

assert 2235 == len(pd.read_csv("%s/ile_de_france_activities.csv" % output_path, usecols = ["household_id"], sep = ";"))
assert 447 == len(pd.read_csv("%s/ile_de_france_persons.csv" % output_path, usecols = ["household_id"], sep = ";"))
assert 149 == len(pd.read_csv("%s/ile_de_france_households.csv" % output_path, usecols = ["household_id"], sep = ";"))

def test_population_with_entd(tmpdir):
run_population(tmpdir, "entd", False)
run_population(tmpdir, "entd")

def test_population_with_egt(tmpdir):
run_population(tmpdir, "egt")

def test_population_with_mode_choice(tmpdir):
run_population(tmpdir, "entd", True)
run_population(tmpdir, "entd", { "mode_choice": True })

def test_population_with_urban_type(tmpdir):
run_population(tmpdir, "entd", {
"use_urban_type": True,
"matching_attributes": [
"urban_type", "*default*"
],
"matching_minimum_observations": 5
})

#def test_population_with_egt(tmpdir):
# run_population(tmpdir, "entd") # TODO: Fix this!
def test_population_with_urban_type_and_egt(tmpdir):
run_population(tmpdir, "egt", {
"use_urban_type": True,
"matching_attributes": [
"urban_type", "*default*"
],
"matching_minimum_observations": 5
})
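The reworked run_population merges a per-test override into the base configuration with dict.update, so each test states only what differs from the defaults. A sketch of the pattern, with values abbreviated from the tests above:

config = dict(
    regions = [10, 11], sampling_rate = 1.0, hts = "entd",
    random_seed = 1000, processes = 1,
    maven_skip_tests = True
)

update = { "use_urban_type": True, "matching_minimum_observations": 5 }
config.update(update)  # override keys win, all other keys keep their defaults

assert config["hts"] == "entd"
assert config["use_urban_type"]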
46 changes: 35 additions & 11 deletions tests/testdata.py
@@ -301,7 +301,7 @@ def create(output_path):
"De 1 000", "De 1 200", "De 1 500", "De 1800",
"De 2 000", "De 2 500", "De 3 000", "De 4 000",
"De 6 000", "10 000"
]), numcom_UU2010 = random.choice(["B", "C", "I", "R"])
]), numcom_UU2010 = ["B", "C", "I", "R"][household_index % 4]
))

for person_index in range(HTS_HOUSEHOLD_MEMBERS):
@@ -388,8 +388,9 @@ def create(output_path):
trips = []
)

person_index = 0
for household_index in range(HTS_HOUSEHOLDS):
household_id = household_index
household_id = household_index * 1000 + 50

municipality = random.choice(df["municipality"].unique())
region = df[df["municipality"] == municipality]["region"].values[0]
Expand All @@ -402,8 +403,7 @@ def create(output_path):
MNP = 3, REVENU = random.randint(12)
))

for person_index in range(HTS_HOUSEHOLD_MEMBERS):
person_id = household_id * 1000 + person_index
for person_id in range(1, HTS_HOUSEHOLD_MEMBERS + 1):
studies = random.random_sample() < 0.3

data["persons"].append(dict(
@@ -421,15 +421,15 @@ def create(output_path):
work_region = df[df["municipality"] == work_municipality]["region"].values[0]
work_department = df[df["municipality"] == work_municipality]["department"].values[0]

purpose = 21 if studies else 11
purpose = 4 if studies else 2
mode = random.choice([1, 2, 3, 5, 7])

origin_hour = 8
origin_minute = 0

if person_index % 100 == 0:
# Testing proper diffusion of plan times
orign_hour = 0
origin_hour = 0
origin_minute = 12

data["trips"].append(dict(
Expand All @@ -442,18 +442,27 @@ def create(output_path):

data["trips"].append(dict(
NQUEST = household_id, NP = person_id,
ND = 1, ORDEP = work_department, DESTDEP = home_department,
ND = 2, ORDEP = work_department, DESTDEP = home_department,
ORH = 8, ORM = 0, DESTH = 9, DESTM = 0, ORCOMM = work_municipality,
DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2,
DESTMOT_H9 = 31, ORMOT_H9 = purpose
DESTMOT_H9 = 5, ORMOT_H9 = purpose
))

data["trips"].append(dict(
NQUEST = household_id, NP = person_id,
ND = 2, ORDEP = home_department, DESTDEP = home_department,
ND = 3, ORDEP = home_department, DESTDEP = home_department,
ORH = 17, ORM = 0, DESTH = 18, DESTM = 0, ORCOMM = home_municipality,
DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2,
DESTMOT_H9 = 1, ORMOT_H9 = 31
DESTMOT_H9 = 1, ORMOT_H9 = 5
))

# Tail
data["trips"].append(dict(
NQUEST = household_id, NP = person_id,
ND = 4, ORDEP = home_department, DESTDEP = home_department,
ORH = 22, ORM = 0, DESTH = 21, DESTM = 0, ORCOMM = home_municipality,
DESTCOMM = home_municipality, DPORTEE = 3, MODP_H7 = 2,
DESTMOT_H9 = 5, ORMOT_H9 = 1
))

os.mkdir("%s/egt_2010" % output_path)
@@ -657,7 +666,22 @@ def create(output_path):

df_sirene_geoloc.to_csv("%s/sirene/GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.zip" % output_path, index = False, sep=";", compression={'method': 'zip', 'archive_name': 'GeolocalisationEtablissement_Sirene_pour_etudes_statistiques_utf8.csv'})


# Data set: Urban type
print("Creating urban type ...")
df_urban_type = df_codes[["DEPCOM"]].copy().rename(columns = { "DEPCOM": "CODGEO" })
df_urban_type = df_urban_type.drop_duplicates()
df_urban_type["STATUT_2017"] = [["B", "C", "I", "H"][k % 4] for k in range(len(df_urban_type))]

df_urban_type = pd.concat([df_urban_type, pd.DataFrame({
"CODGEO": ["75056", "69123", "13055"],
"STATUT_2017": ["C", "C", "C"]
})])

os.mkdir("%s/urban_type" % output_path)
with zipfile.ZipFile("%s/urban_type/UU2020_au_01-01-2023.zip" % output_path, "w") as archive:
with archive.open("UU2020_au_01-01-2023.xlsx", "w") as f:
df_urban_type.to_excel(f, startrow = 5, sheet_name = "Composition_communale", index = False)

# Data set: OSM
# We add a road grid of 500m
print("Creating OSM ...")
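Several fixtures above switch from random.choice to modulo indexing (e.g. ["B", "C", "I", "H"][k % 4]). The cycled value is a pure function of the row index, independent of how many random draws happened before, which keeps the generated test data deterministic and helps keep the row counts asserted in test_pipeline.py stable. A minimal comparison:

import numpy as np

random = np.random.RandomState(0)
values = ["B", "C", "I", "H"]

# RNG draw: depends on the seed and on every draw made before this one
drawn = [random.choice(values) for _ in range(8)]

# Modulo cycling: reproducible from the index alone, immune to RNG call order
cycled = [values[k % len(values)] for k in range(8)]
# -> ["B", "C", "I", "H", "B", "C", "I", "H"]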
