Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

boundary preprocessing #52

Merged
merged 15 commits into from
Oct 7, 2024
Merged
4 changes: 3 additions & 1 deletion config/base.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ seed = 0
region = "leeds"
number_of_households = 10000
zone_id = "OA21CD"
travel_times = true
travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography
boundary_geography = "OA"


[work_assignment]
use_percentages = true
Expand Down
4 changes: 3 additions & 1 deletion config/base_500.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ seed = 0
region = "leeds"
number_of_households = 500
zone_id = "OA21CD"
travel_times = true
travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography
boundary_geography = "OA"


[work_assignment]
use_percentages = true
Expand Down
3 changes: 2 additions & 1 deletion config/base_5000.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ seed = 0
region = "leeds"
number_of_households = 5000
zone_id = "OA21CD"
travel_times = true
travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography
boundary_geography = "OA"

[work_assignment]
use_percentages = true
Expand Down
4 changes: 3 additions & 1 deletion config/base_all.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
seed = 0
region = "leeds"
zone_id = "OA21CD"
travel_times = true
travel_times = true # Only set to true if you have travel time matrix at the level specified in boundary_geography
boundary_geography = "OA"


[work_assignment]
use_percentages = false
Expand Down
78 changes: 78 additions & 0 deletions scripts/0_preprocess_inputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import geopandas as gpd
import pandas as pd
from uatk_spc import Reader

import acbm
from acbm.cli import acbm_cli
from acbm.config import load_config
from acbm.logger_config import preprocessing_logger as logger
from acbm.preprocessing import edit_boundary_resolution


@acbm_cli
def main(config_file):
config = load_config(config_file)
config.init_rng()
region = config.region
# Pick a region with SPC output saved
spc_path = acbm.root_path / "data/external/spc_output/raw/"

# ----- BOUNDARIES
logger.info("Preprocessing Boundary Layer")

## Read in the boundary layer for the whole of England

logger.info("1. Reading in the boundary layer for the whole of England")

boundaries = gpd.read_file(
acbm.root_path / "data/external/boundaries/oa_england.geojson"
)

boundaries = boundaries.to_crs(epsg=4326)

## --- Dissolve boundaries if resolution is MSOA

boundary_geography = config.parameters.boundary_geography # can only be OA or MSOA
logger.info(f"2. Dissolving boundaries to {boundary_geography} level")

boundaries = edit_boundary_resolution(
study_area=boundaries, geography=boundary_geography, zone_id=config.zone_id
)

## --- Filter to study area
# we filter using msoa21cd values, which exist regardless of the boundary resolution

logger.info("3. Filtering boundaries to specified study area")

# Step 1: Get zones from SPC (these will be 2011 MSOAs)
spc = Reader(spc_path, region, backend="pandas")
zones_in_region = list(spc.info_per_msoa.keys())

# Step 2: Filter boundaries to identified zones

# a) get MSOA11CD to MSOA21CD lookup
msoa_lookup = pd.read_csv(
acbm.root_path
/ "data/external/MSOA_2011_MSOA_2021_Lookup_for_England_and_Wales.csv"
)
# Filter msoa_lookup to include only rows where MSOA11CD is in zones_in_region
msoa_lookup_filtered = msoa_lookup[msoa_lookup["MSOA11CD"].isin(zones_in_region)]
# Extract the corresponding MSOA21CD values
msoa21cd_values = msoa_lookup_filtered["MSOA21CD"].tolist()

# b) filter boundaries to include only rows where MSOA21CD is in msoa21cd_values
boundaries_filtered = boundaries[boundaries["MSOA21CD"].isin(msoa21cd_values)]

## Save the output as parquet
logger.info(
f"4. Saving the boundaries to {acbm.root_path / 'data/external/boundaries/'} path"
)

boundaries_filtered.to_file(
acbm.root_path / "data/external/boundaries/study_area_zones.geojson",
driver="GeoJSON",
)


if __name__ == "__main__":
main()
1 change: 0 additions & 1 deletion scripts/1_prep_synthpop.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ def main(config_file):

# Pick a region with SPC output saved
path = acbm.root_path / "data/external/spc_output/raw/"
region = "leeds"

# Add people and households
spc_people_hh = (
Expand Down
12 changes: 3 additions & 9 deletions scripts/3.1_assign_primary_feasible_zones.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,11 @@ def main(config_file):
# --- Study area boundaries

logger.info("Loading study area boundaries")
where_clause = "MSOA21NM LIKE '%Leeds%'"

boundaries = gpd.read_file(
acbm.root_path / "data/external/boundaries/oa_england.geojson",
where=where_clause,
acbm.root_path / "data/external/boundaries/study_area_zones.geojson"
)

# convert boundaries to 4326
boundaries = boundaries.to_crs(epsg=4326)

logger.info("Study area boundaries loaded")

# --- Assign activity home locations to boundaries zoning system
Expand Down Expand Up @@ -99,7 +94,7 @@ def main(config_file):
# If travel_times is not true or loading failed, create a new travel time matrix
logger.info("No travel time matrix found. Creating a new travel time matrix.")
# Create a new travel time matrix based on distances between zones
travel_times = zones_to_time_matrix(zones=boundaries, id_col="OA21CD")
travel_times = zones_to_time_matrix(zones=boundaries, id_col=config.zone_id)
logger.info("Travel time estimates created")

# --- Intrazonal trip times
Expand All @@ -115,8 +110,7 @@ def main(config_file):

logger.info("Creating intrazonal travel time estimates")

# TODO: use config zone_id instead of OA21CD
intrazone_times = intrazone_time(zones=boundaries, key_column="OA21CD")
intrazone_times = intrazone_time(zones=boundaries, key_column=config.zone_id)

logger.info("Intrazonal travel time estimates created")

Expand Down
7 changes: 3 additions & 4 deletions scripts/3.2.2_assign_primary_zone_work.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,13 @@ def main(config_file):

# --- boundaries

where_clause = "MSOA21NM LIKE '%Leeds%'"
logger.info("Loading study area boundaries")

boundaries = gpd.read_file(
acbm.root_path / "data/external/boundaries/oa_england.geojson",
where=where_clause,
acbm.root_path / "data/external/boundaries/study_area_zones.geojson"
)

boundaries = boundaries.to_crs(epsg=4326)
logger.info("Study area boundaries loaded")

# osm POI data

Expand Down
8 changes: 2 additions & 6 deletions scripts/3.2.3_assign_secondary_zone.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,11 @@ def main(config_file):

logger.info("Preprocessing: Adding OA21CD to the data")

where_clause = "MSOA21NM LIKE '%Leeds%'"

boundaries = gpd.read_file(
acbm.root_path / "data/external/boundaries/oa_england.geojson",
where=where_clause,
acbm.root_path / "data/external/boundaries/study_area_zones.geojson"
)

# convert boundaries to 4326
boundaries = boundaries.to_crs(epsg=4326)
logger.info("Study area boundaries loaded")

# --- Assign activity home locations to boundaries zoning system

Expand Down
10 changes: 4 additions & 6 deletions scripts/3.3_assign_facility_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,13 @@ def main(config_file):
)

# --- Load data: Boundaries
logger.info("Loading boundaries data")

where_clause = "MSOA21NM LIKE '%Leeds%'"
logger.info("Loading study area boundaries")

boundaries = gpd.read_file(
acbm.root_path / "data/external/boundaries/oa_england.geojson",
where=where_clause,
acbm.root_path / "data/external/boundaries/study_area_zones.geojson"
)
boundaries = boundaries.to_crs(epsg=4326)

logger.info("Study area boundaries loaded")

# --- Prepprocess: add zone column to POI data
logger.info("Adding zone column to POI data")
Expand Down
1 change: 1 addition & 0 deletions scripts/run_pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

set -e

python scripts/0_preprocess_inputs.py --config_file $1
python scripts/1_prep_synthpop.py --config_file $1
python scripts/2_match_households_and_individuals.py --config_file $1
python scripts/3.1_assign_primary_feasible_zones.py --config_file $1
Expand Down
2 changes: 1 addition & 1 deletion src/acbm/assigning/feasible_zones_primary.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def get_possible_zones(

if travel_times is None:
logger.info("Travel time matrix not provided: Creating travel times estimates")
travel_times = zones_to_time_matrix(zones=boundaries, id_col="OA21CD")
travel_times = zones_to_time_matrix(zones=boundaries, id_col=zone_id)

list_of_modes = activity_chains["mode"].unique()
print(f"Unique modes found in the dataset are: {list_of_modes}")
Expand Down
1 change: 1 addition & 0 deletions src/acbm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class Parameters(BaseModel):
number_of_households: int | None = None
zone_id: str
travel_times: bool
boundary_geography: str


@dataclass(frozen=True)
Expand Down
1 change: 1 addition & 0 deletions src/acbm/logger_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def create_logger(name, log_file):


# Create loggers for different modules
preprocessing_logger = create_logger("preprocessing", "preprocessing.log")
matching_logger = create_logger("matching", "matching.log")
assigning_primary_feasible_logger = create_logger(
"assigning_primary_feasible", "assigning_primary_feasible.log"
Expand Down
54 changes: 53 additions & 1 deletion src/acbm/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,62 @@
import numpy as np
import pandas as pd
from pyproj import Transformer
from shapely import Point
from shapely.geometry import MultiPolygon, Point

import acbm

# ----- PREPROCESSING BOUNDARIES


def edit_boundary_resolution(
study_area: gpd.GeoDataFrame, geography: str, zone_id: str
) -> gpd.GeoDataFrame:
"""
This function takes a GeoDataFrame and a geography resolution as input and returns
a GeoDataFrame with the specified geography resolution. It dissolves OA boundaries
to MSOA boundaries if the geography resolution is set to "MSOA". Otherwise, it
retains the original OA boundaries. Currently it only works for OA and MSOA

Parameters
----------
study_area : gpd.GeoDataFrame
A GeoDataFrame containing the study area boundaries
geography : str
A string specifying the geography resolution. It can be either "OA" or "MSOA"
zone_id : str
The column name of the zone identifier in the study_area GeoDataFrame

Returns
-------
gpd.GeoDataFrame
A GeoDataFrame containing the study area boundaries with the specified geography

"""
# Dissolve based on the specified geography
if geography == "MSOA":
# Drop unnecessary columns (they are lower level than MSOA)
study_area = study_area[[zone_id, "geometry"]]

print("converting from OA to MSOA")
study_area = study_area.dissolve(by="MSOA21CD").reset_index()

elif geography == "OA":
# Drop unnecessary columns
study_area = study_area[
[zone_id, "MSOA21CD", "geometry"]
] # we always need MSOA21CD to filter to study area
print("keeping original OA boundaries")

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps we should raise an exception here to handle any cases where geography provided is not "OA" or "MSOA"? E.g.:

    else:
        msg = f"Invalid geography: '{geography}'. Expected 'OA' or 'MSOA'."
        raise ValueError(msg)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yup, done now: 5e47d0c

# Ensure all geometries are MultiPolygon
study_area["geometry"] = study_area["geometry"].apply(
lambda geom: MultiPolygon([geom]) if geom.geom_type == "Polygon" else geom
)

return study_area


# ----- MATCHING


def nts_filter_by_year(
data: pd.DataFrame, psu: pd.DataFrame, years: list
Expand Down
Loading