Merge pull request #258 from arup-group/fix-timeout

Fix long run time with pandas v2.1.1 and reintroduce timing benchmark test
arup-group · Sep 27, 2023 · 73b9d92 · 73b9d92
2 parents 8631079 + 231274e
commit 73b9d92
Show file tree

Hide file tree

Showing 4 changed files with 56 additions and 57 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - optimise.grid.grid_search fixed ([#239]).
 - `TourPlanner` prevents sampling of duplicate destinations, and prevents origin being sampled as a destination ([#231]).
 - Fix for [#221](https://github.com/arup-group/pam/issues/221), improved "pt simplification" ([#222])
+- Slow loading of data with, e.g., [pam.read.load_travel_diary][pam.read.diary.load_travel_diary] when using pandas v2.1.1 (caused by `pandas.MultiIndex.groupby`; see [pandas issue #55256](https://github.com/pandas-dev/pandas/issues/55256)) ([#258]).
 
 ### Added
 - MATSim warm starting example ([#239]).
@@ -90,6 +91,7 @@ This is the first version of PAM which follows semantic versioning and can be co
 [v0.2.1]: https://github.com/arup-group/pam/compare/v0.2.0...v0.2.1
 [v0.2.0]: https://github.com/arup-group/pam/compare/initial_version...v0.2.0
 
+[#258]: https://github.com/arup-group/pam/pull/258
 [#248]: https://github.com/arup-group/pam/pull/248
 [#240]: https://github.com/arup-group/pam/pull/240
 [#231]: https://github.com/arup-group/pam/pull/231

diff --git a/pam/read/__init__.py b/pam/read/__init__.py
@@ -8,7 +8,6 @@
     add_persons_from_trips,
     build_population,
     from_to_travel_diary_read,
-    hh_person_df_to_dict,
     load_travel_diary,
     sample_population,
     tour_based_travel_diary_read,

diff --git a/pam/read/diary.py b/pam/read/diary.py
@@ -481,29 +481,6 @@ def add_persons_from_trips(population: core.Population, trips: Optional[pd.DataF
         household.add(person)
 
 
-def hh_person_df_to_dict(df: pd.DataFrame, key_hh: str, key_person: str) -> dict[pd.DataFrame]:
-    """Restructure a dataframe as a nested dictionary of dataframes.
-
-    The first level is the household index.
-    The second level is the person index.
-    The value is the dataframe slice corresponding to that person.
-
-    The dictionary structure allows for much faster access to a person's data.
-
-    Args:
-        df (pd.DataFrame): the pandas dataframe to reindex.
-        key_hh (str): the household key column name.
-        key_person (str): the person key column name.
-
-    Returns:
-        dict:
-    """
-    df_dict = {x: {} for x in df[key_hh].unique()}
-    for (hid, pid), person_data in df.groupby([key_hh, key_person]):
-        df_dict[hid][pid] = person_data
-    return df_dict
-
-
 def tour_based_travel_diary_read(
     trips: pd.DataFrame,
     persons_attributes: Optional[pd.DataFrame] = None,
@@ -513,7 +490,7 @@ def tour_based_travel_diary_read(
 ) -> core.Population:
     """Complex travel diary reader.
 
-    Will try to infer home activiity and tour based purposes.
+    Will try to infer home activity and tour based purposes.
 
     Args:
         trips (pd.DataFrame):
@@ -531,17 +508,20 @@ def tour_based_travel_diary_read(
 
     if sort_by_seq is None and "seq" in trips.columns:
         sort_by_seq = True
+    if "seq" not in trips.columns:
+        seq = trips.groupby(["hid", "pid"]).cumcount()
+        trips = trips.assign(seq=seq.values)
 
-    if sort_by_seq:
-        trips = trips.sort_values(["hid", "pid", "seq"])
+    trips = trips.set_index(["hid", "pid", "seq"])
 
-    trips_dict = hh_person_df_to_dict(trips, "hid", "pid")  # convert to dict for faster indexing
+    if sort_by_seq:
+        trips = trips.sort_index()
 
     for hid, household in population:
         for pid, person in household:
-            person_trips = trips_dict.get(hid, {}).get(pid, pd.DataFrame())
-
-            if not len(person_trips):
+            try:
+                person_trips = trips.loc[hid, pid]
+            except KeyError:
                 person.stay_at_home()
                 continue
 
@@ -561,7 +541,7 @@ def tour_based_travel_diary_read(
                 )
             )
 
-            for n, trip in person_trips.iterrows():
+            for seq, trip in person_trips.iterrows():
                 start_loc = None
                 end_loc = None
 
@@ -571,7 +551,7 @@ def tour_based_travel_diary_read(
 
                 person.add(
                     activity.Leg(
-                        seq=n,
+                        seq=seq,
                         purp=trip.purp.lower(),
                         mode=trip["mode"].lower(),
                         start_area=trip.ozone,
@@ -587,7 +567,7 @@ def tour_based_travel_diary_read(
 
                 person.add(
                     activity.Activity(
-                        seq=n + 1,
+                        seq=seq + 1,
                         act=None,
                         area=trip.dzone,
                         loc=end_loc,
@@ -634,17 +614,20 @@ def trip_based_travel_diary_read(
 
     if sort_by_seq is None and "seq" in trips.columns:
         sort_by_seq = True
+    if "seq" not in trips.columns:
+        seq = trips.groupby(["hid", "pid"]).cumcount()
+        trips = trips.assign(seq=seq.values)
 
-    if sort_by_seq:
-        trips = trips.sort_values(["hid", "pid", "seq"])
+    trips = trips.set_index(["hid", "pid", "seq"])
 
-    trips_dict = hh_person_df_to_dict(trips, "hid", "pid")  # convert to dict for faster indexing
+    if sort_by_seq:
+        trips = trips.sort_index()
 
     for hid, household in population:
         for pid, person in household:
-            person_trips = trips_dict.get(hid, {}).get(pid, pd.DataFrame())
-
-            if not len(person_trips):
+            try:
+                person_trips = trips.loc[hid, pid]
+            except KeyError:
                 person.stay_at_home()
                 continue
 
@@ -661,7 +644,7 @@ def trip_based_travel_diary_read(
                 )
             )
 
-            for n, trip in person_trips.iterrows():
+            for seq, trip in person_trips.iterrows():
                 start_loc = None
                 end_loc = None
                 if include_loc:
@@ -671,7 +654,7 @@ def trip_based_travel_diary_read(
 
                 person.add(
                     activity.Leg(
-                        seq=n,
+                        seq=seq,
                         purp=purpose,
                         mode=trip["mode"].lower(),
                         start_area=trip.ozone,
@@ -686,7 +669,7 @@ def trip_based_travel_diary_read(
 
                 person.add(
                     activity.Activity(
-                        seq=n + 1,
+                        seq=seq + 1,
                         act=purpose,
                         area=trip.dzone,
                         loc=end_loc,
@@ -734,17 +717,20 @@ def from_to_travel_diary_read(
 
     if sort_by_seq is None and "seq" in trips.columns:
         sort_by_seq = True
+    if "seq" not in trips.columns:
+        seq = trips.groupby(["hid", "pid"]).cumcount()
+        trips = trips.assign(seq=seq.values)
 
-    if sort_by_seq:
-        trips = trips.sort_values(["hid", "pid", "seq"])
+    trips = trips.set_index(["hid", "pid", "seq"])
 
-    trips_dict = hh_person_df_to_dict(trips, "hid", "pid")  # convert to dict for faster indexing
+    if sort_by_seq:
+        trips = trips.sort_index()
 
     for hid, household in population:
         for pid, person in household:
-            person_trips = trips_dict.get(hid, {}).get(pid, pd.DataFrame())
-
-            if not len(person_trips):
+            try:
+                person_trips = trips.loc[hid, pid]
+            except KeyError:
                 person.stay_at_home()
                 continue
 
@@ -768,7 +754,7 @@ def from_to_travel_diary_read(
                 )
             )
 
-            for n, trip in person_trips.iterrows():
+            for seq, trip in person_trips.iterrows():
                 start_loc = None
                 end_loc = None
                 if include_loc:
@@ -778,7 +764,7 @@ def from_to_travel_diary_read(
 
                 person.add(
                     activity.Leg(
-                        seq=n,
+                        seq=seq,
                         purp=purpose,
                         mode=trip["mode"].lower(),
                         start_area=trip.ozone,
@@ -793,7 +779,7 @@ def from_to_travel_diary_read(
 
                 person.add(
                     activity.Activity(
-                        seq=n + 1,
+                        seq=seq + 1,
                         act=purpose,
                         area=trip.dzone,
                         loc=end_loc,

diff --git a/tests/test_100_memory_profiling.py b/tests/test_100_memory_profiling.py
@@ -5,15 +5,27 @@
 
 from pam import read
 
-BENCHMARK_MEM = "2890.59 MB"
+BENCHMARK_MEM = "1400 MB"
+BENCHMARK_SECONDS = 250
 
 data_dir = Path(__file__).parent / "test_data"
 
 
-@pytest.mark.limit_memory(BENCHMARK_MEM)
-@pytest.mark.high_mem
-def test_activity_loader():
+@pytest.fixture(scope="module")
+def trips_attrs():
     trips = pd.read_csv(data_dir / "extended_travel_diaries.csv.gz")
     attributes = pd.read_csv(data_dir / "extended_persons_data.csv.gz")
     attributes.set_index("pid", inplace=True)
-    read.load_travel_diary(trips, attributes)
+    return trips, attributes
+
+
+@pytest.mark.limit_memory(BENCHMARK_MEM)
+@pytest.mark.high_mem
+def test_activity_loader_mem(trips_attrs):
+    read.load_travel_diary(*trips_attrs)
+
+
+@pytest.mark.timeout(BENCHMARK_SECONDS, func_only=True)
+@pytest.mark.high_mem
+def test_activity_loader_time(trips_attrs):
+    read.load_travel_diary(*trips_attrs)