Skip to content

Commit

Permalink
Merge pull request #258 from arup-group/fix-timeout
Browse files Browse the repository at this point in the history
Fix long run time with pandas v2.1.1 and reintroduce timing benchmark test
  • Loading branch information
brynpickering authored Sep 27, 2023
2 parents 8631079 + 231274e commit 73b9d92
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 57 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- optimise.grid.grid_search fixed ([#239]).
- `TourPlanner` prevents sampling of duplicate destinations, and prevents origin being sampled as a destination ([#231]).
- Fix for [#221](https://github.com/arup-group/pam/issues/221), improved "pt simplification" ([#222])
- Slow loading of data with e.g., [pam.read.load_travel_diary][pam.read.diary.load_travel_diary] when using pandas v2.1.1 (caused by `pandas.MultiIndex.groupby`, see [pandas issue #55256](https://github.com/pandas-dev/pandas/issues/55256)). ([#258])

### Added
- MATSim warm starting example ([#239]).
Expand Down Expand Up @@ -90,6 +91,7 @@ This is the first version of PAM which follows semantic versioning and can be co
[v0.2.1]: https://github.com/arup-group/pam/compare/v0.2.0...v0.2.1
[v0.2.0]: https://github.com/arup-group/pam/compare/initial_version...v0.2.0

[#258]: https://github.com/arup-group/pam/pull/258
[#248]: https://github.com/arup-group/pam/pull/248
[#240]: https://github.com/arup-group/pam/pull/240
[#231]: https://github.com/arup-group/pam/pull/231
Expand Down
1 change: 0 additions & 1 deletion pam/read/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
add_persons_from_trips,
build_population,
from_to_travel_diary_read,
hh_person_df_to_dict,
load_travel_diary,
sample_population,
tour_based_travel_diary_read,
Expand Down
88 changes: 37 additions & 51 deletions pam/read/diary.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,29 +481,6 @@ def add_persons_from_trips(population: core.Population, trips: Optional[pd.DataF
household.add(person)


def hh_person_df_to_dict(df: pd.DataFrame, key_hh: str, key_person: str) -> dict[pd.DataFrame]:
    """Split a dataframe into per-person slices held in a two-level dictionary.

    The outer dictionary is keyed by household identifier and the inner one by
    person identifier; each inner value is the sub-dataframe of rows belonging
    to that (household, person) pair. Looking a person up in the dictionary
    avoids repeatedly filtering the full dataframe.

    Args:
        df (pd.DataFrame): the pandas dataframe to restructure.
        key_hh (str): name of the column holding the household identifier.
        key_person (str): name of the column holding the person identifier.

    Returns:
        dict: ``{household id: {person id: dataframe slice}}``. Every distinct
            value of the household column gets an entry, in order of first
            appearance in ``df``.
    """
    # Seed one (initially empty) inner mapping per distinct household value.
    nested = {}
    for household_id in df[key_hh].unique():
        nested[household_id] = {}

    # Each group key is a (household, person) pair; file its rows accordingly.
    for group_key, person_rows in df.groupby([key_hh, key_person]):
        household_id, person_id = group_key
        nested[household_id][person_id] = person_rows

    return nested


def tour_based_travel_diary_read(
trips: pd.DataFrame,
persons_attributes: Optional[pd.DataFrame] = None,
Expand All @@ -513,7 +490,7 @@ def tour_based_travel_diary_read(
) -> core.Population:
"""Complex travel diary reader.
Will try to infer home activiity and tour based purposes.
Will try to infer home activity and tour based purposes.
Args:
trips (pd.DataFrame):
Expand All @@ -531,17 +508,20 @@ def tour_based_travel_diary_read(

if sort_by_seq is None and "seq" in trips.columns:
sort_by_seq = True
if "seq" not in trips.columns:
seq = trips.groupby(["hid", "pid"]).cumcount()
trips = trips.assign(seq=seq.values)

if sort_by_seq:
trips = trips.sort_values(["hid", "pid", "seq"])
trips = trips.set_index(["hid", "pid", "seq"])

trips_dict = hh_person_df_to_dict(trips, "hid", "pid") # convert to dict for faster indexing
if sort_by_seq:
trips = trips.sort_index()

for hid, household in population:
for pid, person in household:
person_trips = trips_dict.get(hid, {}).get(pid, pd.DataFrame())

if not len(person_trips):
try:
person_trips = trips.loc[hid, pid]
except KeyError:
person.stay_at_home()
continue

Expand All @@ -561,7 +541,7 @@ def tour_based_travel_diary_read(
)
)

for n, trip in person_trips.iterrows():
for seq, trip in person_trips.iterrows():
start_loc = None
end_loc = None

Expand All @@ -571,7 +551,7 @@ def tour_based_travel_diary_read(

person.add(
activity.Leg(
seq=n,
seq=seq,
purp=trip.purp.lower(),
mode=trip["mode"].lower(),
start_area=trip.ozone,
Expand All @@ -587,7 +567,7 @@ def tour_based_travel_diary_read(

person.add(
activity.Activity(
seq=n + 1,
seq=seq + 1,
act=None,
area=trip.dzone,
loc=end_loc,
Expand Down Expand Up @@ -634,17 +614,20 @@ def trip_based_travel_diary_read(

if sort_by_seq is None and "seq" in trips.columns:
sort_by_seq = True
if "seq" not in trips.columns:
seq = trips.groupby(["hid", "pid"]).cumcount()
trips = trips.assign(seq=seq.values)

if sort_by_seq:
trips = trips.sort_values(["hid", "pid", "seq"])
trips = trips.set_index(["hid", "pid", "seq"])

trips_dict = hh_person_df_to_dict(trips, "hid", "pid") # convert to dict for faster indexing
if sort_by_seq:
trips = trips.sort_index()

for hid, household in population:
for pid, person in household:
person_trips = trips_dict.get(hid, {}).get(pid, pd.DataFrame())

if not len(person_trips):
try:
person_trips = trips.loc[hid, pid]
except KeyError:
person.stay_at_home()
continue

Expand All @@ -661,7 +644,7 @@ def trip_based_travel_diary_read(
)
)

for n, trip in person_trips.iterrows():
for seq, trip in person_trips.iterrows():
start_loc = None
end_loc = None
if include_loc:
Expand All @@ -671,7 +654,7 @@ def trip_based_travel_diary_read(

person.add(
activity.Leg(
seq=n,
seq=seq,
purp=purpose,
mode=trip["mode"].lower(),
start_area=trip.ozone,
Expand All @@ -686,7 +669,7 @@ def trip_based_travel_diary_read(

person.add(
activity.Activity(
seq=n + 1,
seq=seq + 1,
act=purpose,
area=trip.dzone,
loc=end_loc,
Expand Down Expand Up @@ -734,17 +717,20 @@ def from_to_travel_diary_read(

if sort_by_seq is None and "seq" in trips.columns:
sort_by_seq = True
if "seq" not in trips.columns:
seq = trips.groupby(["hid", "pid"]).cumcount()
trips = trips.assign(seq=seq.values)

if sort_by_seq:
trips = trips.sort_values(["hid", "pid", "seq"])
trips = trips.set_index(["hid", "pid", "seq"])

trips_dict = hh_person_df_to_dict(trips, "hid", "pid") # convert to dict for faster indexing
if sort_by_seq:
trips = trips.sort_index()

for hid, household in population:
for pid, person in household:
person_trips = trips_dict.get(hid, {}).get(pid, pd.DataFrame())

if not len(person_trips):
try:
person_trips = trips.loc[hid, pid]
except KeyError:
person.stay_at_home()
continue

Expand All @@ -768,7 +754,7 @@ def from_to_travel_diary_read(
)
)

for n, trip in person_trips.iterrows():
for seq, trip in person_trips.iterrows():
start_loc = None
end_loc = None
if include_loc:
Expand All @@ -778,7 +764,7 @@ def from_to_travel_diary_read(

person.add(
activity.Leg(
seq=n,
seq=seq,
purp=purpose,
mode=trip["mode"].lower(),
start_area=trip.ozone,
Expand All @@ -793,7 +779,7 @@ def from_to_travel_diary_read(

person.add(
activity.Activity(
seq=n + 1,
seq=seq + 1,
act=purpose,
area=trip.dzone,
loc=end_loc,
Expand Down
22 changes: 17 additions & 5 deletions tests/test_100_memory_profiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,27 @@

from pam import read

BENCHMARK_MEM = "2890.59 MB"
BENCHMARK_MEM = "1400 MB"
BENCHMARK_SECONDS = 250

data_dir = Path(__file__).parent / "test_data"


@pytest.mark.limit_memory(BENCHMARK_MEM)
@pytest.mark.high_mem
def test_activity_loader():
@pytest.fixture(scope="module")
def trips_attrs():
trips = pd.read_csv(data_dir / "extended_travel_diaries.csv.gz")
attributes = pd.read_csv(data_dir / "extended_persons_data.csv.gz")
attributes.set_index("pid", inplace=True)
read.load_travel_diary(trips, attributes)
return trips, attributes


@pytest.mark.limit_memory(BENCHMARK_MEM)
@pytest.mark.high_mem
def test_activity_loader_mem(trips_attrs):
    """Run the travel diary loader under the ``limit_memory`` marker.

    The limit passed to the marker is ``BENCHMARK_MEM``. The ``trips_attrs``
    fixture value is unpacked positionally into ``read.load_travel_diary``.
    """
    # NOTE(review): `limit_memory` looks like the pytest-memray marker - confirm.
    read.load_travel_diary(*trips_attrs)


@pytest.mark.timeout(BENCHMARK_SECONDS, func_only=True)
@pytest.mark.high_mem
def test_activity_loader_time(trips_attrs):
    """Run the travel diary loader under a ``BENCHMARK_SECONDS`` timeout marker.

    The ``trips_attrs`` fixture value is unpacked positionally into
    ``read.load_travel_diary``.
    """
    # NOTE(review): `func_only=True` presumably limits the timeout to the test
    # body, excluding fixture setup - confirm against the pytest-timeout docs.
    read.load_travel_diary(*trips_attrs)

0 comments on commit 73b9d92

Please sign in to comment.