Merge pull request #458 from jeromekelleher/fix-plots

Fixup some issues with resource and sample plotting
jeromekelleher · Dec 18, 2024 · e9d1fdc · e9d1fdc
2 parents 33a72d6 + 22086e9
commit e9d1fdc
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 27 deletions.
diff --git a/sc2ts/info.py b/sc2ts/info.py
@@ -904,10 +904,7 @@ def samples_summary(self):
                 data.append({"date": date, **row})
         df = pd.DataFrame(data)
         df["inserted"] = df["total"] - df["rejected"] - df["exact_matches"]
-        if "total_hmm_cost" not in df:
-            # TMP! Remove this once we've got total_hmm_cost in the actual m
-            df["total_hmm_cost"] = df["mean_hmm_cost"] * df["total"]
-        return df.astype({"date": "datetime64[s]"})
+        return df
 
     def sample_groups_summary(self):
         data = []
@@ -1583,13 +1580,12 @@ def plot_deletion_overlaps(self, annotate_threshold=0.9):
     def plot_samples_per_day(
         self, start_date="2020-01-01", end_date="3000-01-01", scorpio_fraction=0.05
     ):
-        start_date = np.datetime64(start_date)
-        end_date = np.datetime64(end_date)
         df = self.samples_summary()
         df = df[(df.date >= start_date) & (df.date < end_date)]
 
-        dfa = df.groupby("date").sum().reset_index()
+        dfa = df.groupby("date").sum().reset_index().astype({"date": "datetime64[s]"})
         dfa["mean_hmm_cost"] = dfa["total_hmm_cost"] / dfa["total"]
+
         fig, (ax1, ax2, ax3, ax4) = self._wide_plot(4, height=12, sharex=True)
         exact_col = "tab:red"
         in_col = "tab:purple"
@@ -1630,7 +1626,9 @@ def plot_samples_per_day(
 
         df_scorpio = df.pivot_table(
             columns="scorpio", index="date", values="total", aggfunc="sum", fill_value=0
-        )
+        ).reset_index()
+        # Need to force conversion back to datetime here for some reason
+        df_scorpio = df_scorpio.astype({"date": "datetime64[s]"}).set_index("date")
         # convert to fractions
         df_scorpio = df_scorpio.divide(df_scorpio.sum(axis="columns"), axis="index")
         # Remove columns that don't have more than the threshold
@@ -1668,18 +1666,13 @@ def plot_resources(self, start_date="2020-01-01", end_date="3000-01-01"):
         ts = self.ts
         fig, ax = self._wide_plot(3, height=8, sharex=True)
 
-        start_date = np.datetime64(start_date)
-        end_date = np.datetime64(end_date)
-        df = self.samples_summary()
-
-        dfs = self.samples_summary().set_index("date")
+        dfs = self.samples_summary()
         dfa = dfs.groupby("date").sum()
         dfa["mean_hmm_cost"] = dfa["total_hmm_cost"] / dfa["total"]
-        df = self.resources_summary().set_index("date")
-        # Should be able to do this with join, but I failed
-        df["samples_in_arg"] = dfa.loc[df.index]["inserted"]
-        df["samples_processed"] = dfa.loc[df.index]["total"]
-        df["mean_hmm_cost"] = dfa.loc[df.index]["mean_hmm_cost"]
+        df = dfa.join(self.resources_summary(), how="inner")
+        df = df.rename(
+            columns={"inserted": "samples_in_arg", "total": "samples_processed"}
+        )
         df = df[(df.index >= start_date) & (df.index < end_date)]
 
         df["cpu_time"] = df.user_time + df.sys_time
@@ -1727,17 +1720,12 @@ def plot_resources(self, start_date="2020-01-01", end_date="3000-01-01"):
     def resources_summary(self):
         ts = self.ts
         data = []
-        df_samples = self.samples_summary()
-        dates = df_samples["date"].unique()
-        assert len(dates) == ts.num_provenances
-        for j in range(ts.num_provenances):
-            p = ts.provenance(j)
+        for p in ts.provenances():
             record = json.loads(p.record)
             text_date = record["parameters"]["date"]
-            assert text_date == str(dates[j]).split(" ")[0]
             resources = record["resources"]
-            data.append({"date": dates[j], **resources})
-        return pd.DataFrame(data)
+            data.append({"date": text_date, **resources})
+        return pd.DataFrame(data).set_index("date")
 
     def node_type_summary(self):
         ts = self.ts

diff --git a/tests/test_info.py b/tests/test_info.py
@@ -230,7 +230,7 @@ def test_draw_subtree(self, fx_ti_2020_02_13):
     def test_resources_summary(self, fx_ti_2020_02_13):
         df = fx_ti_2020_02_13.resources_summary()
         assert df.shape[0] == 20
-        assert np.all(df.date.astype(str).str.startswith("2020"))
+        assert np.all(df.index.astype(str).str.startswith("2020"))
 
     def test_samples_summary(self, fx_ti_2020_02_13):
         df = fx_ti_2020_02_13.samples_summary()