fix: more progress on load_all_df, added derived.sleep, finished Whoop sleep loading, fixed location loading, and many other fixes
Showing 10 changed files with 354 additions and 85 deletions.
@@ -1,13 +1,134 @@
 from aw_core import Event
-from typing import Literal
+from typing import Literal, TypeAlias
+from datetime import date, datetime, timedelta, timezone

-from .heartrate import load_heartrate_daily_df
-from .screentime import load_category_df
 import pandas as pd

-Sources = Literal["activitywatch", "heartrate"]
+from ..load.location import load_daily_df as load_location_daily_df
+from ..load.qslang import load_daily_df as load_drugs_df
+
+from .heartrate import load_heartrate_summary_df
+from .screentime import load_screentime_cached, load_category_df
+from .sleep import load_sleep_df
+
+Sources = Literal["screentime", "heartrate", "drugs", "location"]
+
+
+def load_all_df(
+    fast=True, screentime_events: list[Event] | None = None, ignore: list[Sources] = []
+) -> pd.DataFrame:
+    """
+    Loads a bunch of data into a single dataframe with one row per day.
+    Serves as a useful starting point for further analysis.
+    """
+    df = pd.DataFrame()
+    since = datetime.now(tz=timezone.utc) - timedelta(days=30 if fast else 2*365)
+
+    if "screentime" not in ignore:
+        print("Adding screentime")
+        if screentime_events is None:
+            screentime_events = load_screentime_cached(fast=fast, since=since)
+        df_time = load_category_df(screentime_events)
+        df_time = df_time[["Work", "Media", "ActivityWatch"]]
+        df = join(df, df_time.add_prefix("time:"))

-def load_all_df(events: list[Event], ignore: list[Sources] = []):
-    df = load_category_df(events)
     if "heartrate" not in ignore:
-        df = df.join(load_heartrate_daily_df(events))
+        print("Adding heartrate")
+        df_hr = load_heartrate_summary_df(freq="D")
+        # translate daily datetime column to a date column
+        df_hr.index = df_hr.index.date  # type: ignore
+        df = join(df, df_hr)
+
+    if "drugs" not in ignore:
+        print("Adding drugs")
+        # keep only columns starting with "tag"
+        df_drugs = load_drugs_df()
+        df_drugs = df_drugs[df_drugs.columns[df_drugs.columns.str.startswith("tag")]]
+        df = join(df, df_drugs)
+
+    if "location" not in ignore:
+        print("Adding location")
+        # TODO: add boolean for if sleeping together
+        df_location = load_location_daily_df()
+        df_location.index = df_location.index.date  # type: ignore
+        df = join(df, df_location.add_prefix("loc:"))
+
+    if "sleep" not in ignore:
+        df_sleep = load_sleep_df()
+        df = join(df, df_sleep.add_prefix("sleep:"))
+
+    # look for all-na columns, emit a warning, and drop them
+    na_cols = df.columns[df.isna().all()]
+    if len(na_cols) > 0:
+        print(f"Warning: dropping all-NA columns: {str(list(na_cols))}")
+        df = df.drop(columns=na_cols)
+
+    return df
+
+
+def join(df_target: pd.DataFrame, df_source: pd.DataFrame) -> pd.DataFrame:
+    if not df_target.empty:
+        check_new_data_in_range(df_source, df_target)
+        print(f"Adding new columns: {str(list(df_source.columns.difference(df_target.columns)))}")
+    return df_target.join(df_source) if not df_target.empty else df_source
+
+
+DateLike: TypeAlias = datetime | date | pd.Timestamp
+
+
+def datelike_to_date(d: DateLike) -> date:
+    if isinstance(d, datetime) or isinstance(d, pd.Timestamp):
+        return d.date()
+    elif isinstance(d, date):
+        return d
+    else:
+        raise ValueError(f"Invalid type for datelike: {type(d)}")
+
+
+def check_new_data_in_range(df_source: pd.DataFrame, df_target: pd.DataFrame) -> None:
+    # check that source data covers target data, or emit warning
+    source_start = datelike_to_date(df_source.index.min())
+    source_end = datelike_to_date(df_source.index.max())
+    target_start = datelike_to_date(df_target.index.min())
+    target_end = datelike_to_date(df_target.index.max())
+
+    # check the worst case
+    if source_start > target_end or source_end < target_start:
+        print(
+            f"Warning: source data does not cover ANY of target data: ({source_start}/{source_end}) not in ({target_start}/{target_end})"
+        )
+    elif source_start > target_start:
+        print(
+            f"Warning: source data starts after target data (partial): {source_start} > {target_start}"
+        )
+    elif source_end < target_end:
+        print(
+            f"Warning: source data ends before target data (partial): {source_end} < {target_end}"
+        )
+
+
+if __name__ == "__main__":
+    import os
+    import logging
+    logging.basicConfig(level=logging.INFO)
+
+    # print a summary of all data
+    df = load_all_df(fast=os.environ.get("FAST", "1") == "1")
+    print(df)
+    print(df.describe())
+
+    # check for missing data
+    df_days_na = df.isna().sum()
+    df_days_na = df_days_na[df_days_na > 0]
+    if len(df_days_na) > 0:
+        print(f"Missing data for {len(df_days_na)} out of {len(df.columns)} columns")
+        print(df_days_na)
+        print("Total days: ", len(df))
+
+    # keep days with full coverage
+    df = df.dropna()
+    print("Total days with full coverage: ", len(df))
+
+    print("Final dataframe:")
+    print(df)
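For context, a minimal sketch (not part of the commit) of the join pattern the new load_all_df relies on: each loader returns a date-indexed daily frame, and the frames are combined column-wise, with the first non-empty frame seeding the result. Column names and values below are made up for illustration.

import pandas as pd

dates = pd.date_range("2023-01-01", periods=3).date
df_time = pd.DataFrame({"time:Work": [3.5, 4.0, 2.0]}, index=dates)
df_sleep = pd.DataFrame({"sleep:duration": [7.9, 6.5, 8.2]}, index=dates)

df = pd.DataFrame()
for part in (df_time, df_sleep):
    # mirrors the join() helper above: pass through when empty, else left-join on the date index
    df = part if df.empty else df.join(part)
print(df)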
@@ -0,0 +1,58 @@
+"""
+Aggregates sleep data from Fitbit, Oura, and Whoop into a single dataframe.
+"""
+
+from datetime import datetime, timedelta, timezone
+
+import pandas as pd
+
+from ..load.fitbit import load_sleep_df as load_fitbit_sleep_df
+from ..load.oura import load_sleep_df as load_oura_sleep_df
+from ..load.whoop import load_sleep_df as load_whoop_sleep_df
+
+
+def load_sleep_df(ignore: list[str] = []) -> pd.DataFrame:
+    """
+    Loads sleep data from Fitbit, Oura, and Whoop into a single dataframe.
+    """
+    df = pd.DataFrame()
+
+    # Fitbit
+    #df = join(df, load_fitbit_sleep_df(), rsuffix="_fitbit")
+
+    # Oura
+    if "oura" not in ignore:
+        df_oura = load_oura_sleep_df()
+        df = join(df, df_oura.add_suffix("_oura"))
+
+    # Whoop
+    if "whoop" not in ignore:
+        df_whoop = load_whoop_sleep_df()
+        df = join(df, df_whoop.add_suffix("_whoop"))
+
+    # perform some aggregations
+    keys = list(set(col.split("_")[0] for col in df.columns) & {"duration", "score"})
+    for key in keys:
+        subkeys = df.columns[df.columns.str.startswith(key)]
+        df[key] = df[subkeys].mean(axis=1)
+    df = df[keys]
+
+    return df
+
+
+def join(df_target, df_source, **kwargs) -> pd.DataFrame:
+    if df_target.empty:
+        return df_source
+    else:
+        return df_target.join(df_source, **kwargs)
+
+
+if __name__ == "__main__":
+    df = load_sleep_df()
+    print(df)
+    """
+    df["duration_whoop"].plot()
+    import matplotlib.pyplot as plt
+    plt.show()
+    """