From 3de86772fe23d2dceca46e92da8616af1be37269 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Erik=20Bj=C3=A4reholt?=
Date: Tue, 23 May 2023 19:12:23 +0200
Subject: [PATCH] fix: more progress on `load_all_df`, added `derived.sleep`,
 finished Whoop sleep loading, fixed location loading, and many other fixes

---
 config.example.toml                    |  13 ++-
 src/quantifiedme/derived/all_df.py     | 137 +++++++++++++++++++++++--
 src/quantifiedme/derived/heartrate.py  |  38 ++++---
 src/quantifiedme/derived/screentime.py |  27 ++++-
 src/quantifiedme/derived/sleep.py      |  58 +++++++++++
 src/quantifiedme/load/fitbit.py        |   4 +
 src/quantifiedme/load/location.py      |  85 +++++++--------
 src/quantifiedme/load/oura.py          |  14 ++-
 src/quantifiedme/load/qslang.py        |  22 ++--
 src/quantifiedme/load/whoop.py         |  43 +++++++-
 tests/test_load.py                     |  34 ++++--
 11 files changed, 380 insertions(+), 95 deletions(-)
 create mode 100644 src/quantifiedme/derived/sleep.py

diff --git a/config.example.toml b/config.example.toml
index dad1e4d..8d3e5aa 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -8,18 +8,17 @@ name = "john"
 date_offset_hours = 5
 
 [data]
-categories= "categories.example.toml"
-habitbull = "~/Downloads/HabitBullData.csv"
-location = "~/location"
-oura = "~/Downloads/oura_2020-02-27T09-07-47.json"
+categories = "~/work/quantifiedme/quantifiedme/categories.example.toml"
+#habitbull = "~/Downloads/HabitBullData.csv"
+#location = "~/location"
+#oura = "~/Downloads/oura_2020-02-27T09-07-47.json"
 
 [data.activitywatch]
 port = 5666
 hostnames = ["fakedata"]
 
-[data.smartertime_buckets]
-example-hostname = '~/data/smartertime/smartertime_export_example-hostname_2020-01-01_bb7f26aa.awbucket.json'
-
+#[data.smartertime_buckets]
+#example-hostname = '~/data/smartertime/smartertime_export_example-hostname_2020-01-01_bb7f26aa.awbucket.json'
 
 [locations]
 [locations.gym]
diff --git a/src/quantifiedme/derived/all_df.py b/src/quantifiedme/derived/all_df.py
index 5be8e3f..a697e84 100644
--- a/src/quantifiedme/derived/all_df.py
+++ b/src/quantifiedme/derived/all_df.py
@@ -1,13 +1,136 @@
+import os
+import logging
+from typing import Literal, TypeAlias
+from datetime import date, datetime, timedelta, timezone
+
+import pandas as pd
+
 from aw_core import Event
-from typing import Literal
 
-from .heartrate import load_heartrate_daily_df
-from .screentime import load_category_df
+from ..load.location import load_daily_df as load_location_daily_df
+from ..load.qslang import load_daily_df as load_drugs_df
+
+from .heartrate import load_heartrate_summary_df
+from .screentime import load_screentime_cached, load_category_df
+from .sleep import load_sleep_df
+
+Sources = Literal["screentime", "heartrate", "drugs", "location", "sleep"]
 
-Sources = Literal["activitywatch", "heartrate"]
 
-def load_all_df(events: list[Event], ignore: list[Sources] = []):
-    df = load_category_df(events)
+def load_all_df(
+    fast=True, screentime_events: list[Event] | None = None, ignore: list[Sources] = []
+) -> pd.DataFrame:
+    """
+    Loads data from all sources into a single dataframe with one row per day.
+    Serves as a useful starting point for further analysis.
+    """
+    df = pd.DataFrame()
+    since = datetime.now(tz=timezone.utc) - timedelta(days=30 if fast else 2 * 365)
+
+    if "screentime" not in ignore:
+        print("Adding screentime")
+        if screentime_events is None:
+            screentime_events = load_screentime_cached(fast=fast, since=since)
+        df_time = load_category_df(screentime_events)
+        df_time = df_time[["Work", "Media", "ActivityWatch"]]
+        df = join(df, df_time.add_prefix("time:"))
+
     if "heartrate" not in ignore:
-        df = df.join(load_heartrate_daily_df(events))
+        print("Adding heartrate")
+        df_hr = load_heartrate_summary_df(freq="D")
+        # translate daily datetime column to a date column
+        df_hr.index = df_hr.index.date  # type: ignore
+        df = join(df, df_hr)
+
+    if "drugs" not in ignore:
+        print("Adding drugs")
+        # keep only columns starting with "tag"
+        df_drugs = load_drugs_df()
+        df_drugs = df_drugs[df_drugs.columns[df_drugs.columns.str.startswith("tag")]]
+        df = join(df, df_drugs)
+
+    if "location" not in ignore:
+        print("Adding location")
+        # TODO: add boolean for if sleeping together
+        df_location = load_location_daily_df()
+        df_location.index = df_location.index.date  # type: ignore
+        df = join(df, df_location.add_prefix("loc:"))
+
+    if "sleep" not in ignore:
+        print("Adding sleep")
+        df_sleep = load_sleep_df()
+        df = join(df, df_sleep.add_prefix("sleep:"))
+
+    # look for all-NA columns, emit a warning, and drop them
+    na_cols = df.columns[df.isna().all()]
+    if len(na_cols) > 0:
+        print(f"Warning: dropping all-NA columns: {list(na_cols)}")
+        df = df.drop(columns=na_cols)
     return df
+
+
+def join(df_target: pd.DataFrame, df_source: pd.DataFrame) -> pd.DataFrame:
+    if not df_target.empty:
+        check_new_data_in_range(df_source, df_target)
+        print(
+            f"Adding new columns: {list(df_source.columns.difference(df_target.columns))}"
+        )
+    return df_target.join(df_source) if not df_target.empty else df_source
+
+
+DateLike: TypeAlias = datetime | date | pd.Timestamp
+
+
+def datelike_to_date(d: DateLike) -> date:
+    if isinstance(d, (datetime, pd.Timestamp)):
+        return d.date()
+    elif isinstance(d, date):
+        return d
+    else:
+        raise ValueError(f"Invalid type for datelike: {type(d)}")
+
+
+def check_new_data_in_range(df_source: pd.DataFrame, df_target: pd.DataFrame) -> None:
+    # check that source data covers target data, or emit a warning
+    source_start = datelike_to_date(df_source.index.min())
+    source_end = datelike_to_date(df_source.index.max())
+    target_start = datelike_to_date(df_target.index.min())
+    target_end = datelike_to_date(df_target.index.max())
+
+    # check the worst case first: no overlap at all
+    if source_start > target_end or source_end < target_start:
+        print(
+            f"Warning: source data does not cover ANY of target data: ({source_start}/{source_end}) not in ({target_start}/{target_end})"
+        )
+    elif source_start > target_start:
+        print(
+            f"Warning: source data starts after target data (partial): {source_start} > {target_start}"
+        )
+    elif source_end < target_end:
+        print(
+            f"Warning: source data ends before target data (partial): {source_end} < {target_end}"
+        )
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+
+    # print a summary of all data
+    df = load_all_df(fast=os.environ.get("FAST", "1") == "1")
+    print(df)
+    print(df.describe())
+
+    # check for missing data
+    df_days_na = df.isna().sum()
+    df_days_na = df_days_na[df_days_na > 0]
+    if len(df_days_na) > 0:
+        print(f"Missing data for {len(df_days_na)} out of {len(df.columns)} columns")
+        print(df_days_na)
+    print("Total days: ", len(df))
+
+    # keep days with full coverage
+    df = df.dropna()
+    print("Total days with full coverage: ", len(df))
+
+    print("Final dataframe:")
+    print(df)
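To make the join semantics above concrete, here is a minimal standalone sketch of what `join()` does when sources only partially overlap. The index values and column names are illustrative only, not from the repo:

import pandas as pd

df_a = pd.DataFrame(
    {"time:Work": [1.0, 2.0]},
    index=pd.to_datetime(["2023-05-01", "2023-05-02"]).date,
)
df_b = pd.DataFrame(
    {"hr_mean": [60.0]},
    index=pd.to_datetime(["2023-05-02"]).date,
)

# left join on the date index, as join() does when df_target is non-empty
df = df_a.join(df_b)
print(df)
# 2023-05-01 gets NaN for hr_mean; check_new_data_in_range() would warn
# that the source starts after the target (partial coverage).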
print("Total days with full coverage: ", len(df)) + + print("Final dataframe:") + print(df) diff --git a/src/quantifiedme/derived/heartrate.py b/src/quantifiedme/derived/heartrate.py index c1cb17a..64187a3 100644 --- a/src/quantifiedme/derived/heartrate.py +++ b/src/quantifiedme/derived/heartrate.py @@ -26,27 +26,37 @@ def load_heartrate_df() -> pd.DataFrame: return df -def load_heartrate_daily_df( - zones={"low": 100, "med": 140, "high": 160}, freq="D" +def load_heartrate_minutes_df(): + """We consider using minute-resolution a decent starting point for summary heartrate data. + + NOTE: ignores source, combines all sources into a single point per freq. + """ + df = load_heartrate_df().drop(columns=["source"]) + df = df.resample("1min").mean() + return df + + +def load_heartrate_summary_df( + zones={"resting": 0, "low": 100, "med": 140, "high": 160}, freq="D" ) -> pd.DataFrame: """ - Load heartrates, group into day, bin by zone, and return a dataframe. - - NOTE: Ignores source, combines all sources into a single point per freq. + Load heartrates, group into freq, bin by zone, and return a dataframe. """ - source_df = load_heartrate_df().drop(columns=["source"]) + source_df = load_heartrate_minutes_df() df = pd.DataFrame() - df["hr"] = source_df["hr"].groupby(pd.Grouper(freq=freq)).mean() - df["zone"] = pd.cut( - df["hr"], bins=[0, *zones.values(), 300], labels=["resting", *zones.keys()] + df["hr_mean"] = source_df["hr"].groupby(pd.Grouper(freq=freq)).mean() + + # compute time spent in each zone + df_zones = pd.cut( + source_df["hr"], bins=[*zones.values(), 300], labels=[*zones.keys()] ) + for zone in zones.keys(): + df[f"hr_duration_{zone}"] = df_zones[df_zones == zone].groupby( + pd.Grouper(freq=freq) + ).count() * pd.Timedelta(minutes=1) return df if __name__ == "__main__": - df = load_heartrate_df() - print(df) - print(df.describe()) - - df = load_heartrate_daily_df() + df = load_heartrate_summary_df() print(df) diff --git a/src/quantifiedme/derived/screentime.py b/src/quantifiedme/derived/screentime.py index 3c67048..ec5f5ea 100644 --- a/src/quantifiedme/derived/screentime.py +++ b/src/quantifiedme/derived/screentime.py @@ -1,3 +1,4 @@ +import pickle import logging from datetime import datetime, timezone, timedelta from pathlib import Path @@ -37,10 +38,10 @@ def _get_aw_client(testing: bool) -> ActivityWatchClient: def load_screentime( - since: datetime | None, - datasources: list[DatasourceType] | None, - hostnames: list[str] | None, - personal: bool, + since: datetime | None = None, + datasources: list[DatasourceType] | None = None, + hostnames: list[str] | None = None, + personal: bool = True, cache: bool = True, awc: ActivityWatchClient | None = None, ) -> list[Event]: @@ -122,6 +123,24 @@ def load_screentime( return events +def load_screentime_cached(*args, since: datetime | None = None, fast = False, **kwargs) -> list[Event]: + # returns screentime from picked cache produced by Dashboard.ipynb (or here) + path = Path(__file__).parent.parent.parent.parent / "notebooks" / ("events_fast.pickle" if fast else "events.pickle") + if path.exists(): + print(f"Loading from cache: {path}") + with open(path, "rb") as f: + events = pickle.load(f) + # if fast didn't get us enough data to satisfy the query, we need to load the rest + if fast and since and events[-1].timestamp < since: + print("Fast couldn't satisfy since, trying again without fast") + events = load_screentime_cached(fast=False, **kwargs) + # trim according to since + if since: + events = [e for e in events if e.timestamp >= 
diff --git a/src/quantifiedme/derived/screentime.py b/src/quantifiedme/derived/screentime.py
index 3c67048..ec5f5ea 100644
--- a/src/quantifiedme/derived/screentime.py
+++ b/src/quantifiedme/derived/screentime.py
@@ -1,3 +1,4 @@
+import pickle
 import logging
 from datetime import datetime, timezone, timedelta
 from pathlib import Path
@@ -37,10 +38,10 @@ def _get_aw_client(testing: bool) -> ActivityWatchClient:
 
 
 def load_screentime(
-    since: datetime | None,
-    datasources: list[DatasourceType] | None,
-    hostnames: list[str] | None,
-    personal: bool,
+    since: datetime | None = None,
+    datasources: list[DatasourceType] | None = None,
+    hostnames: list[str] | None = None,
+    personal: bool = True,
     cache: bool = True,
     awc: ActivityWatchClient | None = None,
 ) -> list[Event]:
@@ -122,6 +123,24 @@ def load_screentime(
     return events
 
 
+def load_screentime_cached(*args, since: datetime | None = None, fast=False, **kwargs) -> list[Event]:
+    # returns screentime from pickled cache produced by Dashboard.ipynb (or here)
+    path = Path(__file__).parent.parent.parent.parent / "notebooks" / ("events_fast.pickle" if fast else "events.pickle")
+    if path.exists():
+        print(f"Loading from cache: {path}")
+        with open(path, "rb") as f:
+            events = pickle.load(f)
+        # if the fast cache doesn't reach back far enough to satisfy `since`, load the full cache
+        if fast and since and events[0].timestamp > since:
+            print("Fast cache couldn't satisfy since, trying again without fast")
+            events = load_screentime_cached(since=since, fast=False, **kwargs)
+        # trim according to since
+        if since:
+            events = [e for e in events if e.timestamp >= since]
+        return events
+    else:
+        return load_screentime(*args, **kwargs)
+
 
 def _join_events(
     old_events: list[Event], new_events: list[Event], source: str
diff --git a/src/quantifiedme/derived/sleep.py b/src/quantifiedme/derived/sleep.py
new file mode 100644
index 0000000..5f7bafa
--- /dev/null
+++ b/src/quantifiedme/derived/sleep.py
@@ -0,0 +1,58 @@
+"""
+Aggregates sleep data from Fitbit, Oura, and Whoop into a single dataframe.
+"""
+
+from datetime import datetime, timedelta, timezone
+
+import pandas as pd
+
+from ..load.fitbit import load_sleep_df as load_fitbit_sleep_df
+from ..load.oura import load_sleep_df as load_oura_sleep_df
+from ..load.whoop import load_sleep_df as load_whoop_sleep_df
+
+
+def load_sleep_df(ignore: list[str] = []) -> pd.DataFrame:
+    """
+    Loads sleep data from Fitbit, Oura, and Whoop into a single dataframe.
+    """
+    df = pd.DataFrame()
+
+    # Fitbit (not yet implemented in load.fitbit)
+    #df = join(df, load_fitbit_sleep_df(), rsuffix="_fitbit")
+
+    # Oura
+    if "oura" not in ignore:
+        df_oura = load_oura_sleep_df()
+        df = join(df, df_oura.add_suffix("_oura"))
+
+    # Whoop
+    if "whoop" not in ignore:
+        df_whoop = load_whoop_sleep_df()
+        df = join(df, df_whoop.add_suffix("_whoop"))
+
+    # average the per-source columns into one column per key
+    keys = list(set(col.split("_")[0] for col in df.columns) & {"duration", "score"})
+    for key in keys:
+        subkeys = df.columns[df.columns.str.startswith(key)]
+        df[key] = df[subkeys].mean(axis=1)
+    df = df[keys]
+
+    return df
+
+
+def join(df_target, df_source, **kwargs) -> pd.DataFrame:
+    if df_target.empty:
+        return df_source
+    else:
+        return df_target.join(df_source, **kwargs)
+
+
+if __name__ == "__main__":
+    df = load_sleep_df()
+    print(df)
+    """
+    df["duration_whoop"].plot()
+    import matplotlib.pyplot as plt
+
+    plt.show()
+    """
\ No newline at end of file
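The aggregation step at the end of `load_sleep_df` averages the per-source columns into one column per key. A small sketch of the same pattern with made-up values (hours as floats for brevity; the real code works on Timedeltas, which `mean` also handles):

import pandas as pd

df = pd.DataFrame(
    {
        "duration_oura": [7.5, 8.0],
        "duration_whoop": [7.0, 8.5],
        "score_oura": [80, 90],
    }
)

keys = list(set(col.split("_")[0] for col in df.columns) & {"duration", "score"})
for key in keys:
    subkeys = df.columns[df.columns.str.startswith(key)]
    df[key] = df[subkeys].mean(axis=1)
print(df[keys])  # duration is the mean of both sources; score falls back to Oura alone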
diff --git a/src/quantifiedme/load/fitbit.py b/src/quantifiedme/load/fitbit.py
index c5a79d7..cf1e400 100644
--- a/src/quantifiedme/load/fitbit.py
+++ b/src/quantifiedme/load/fitbit.py
@@ -6,6 +6,10 @@
 import pandas as pd
 
 
+def load_sleep_df() -> pd.DataFrame:
+    raise NotImplementedError
+
+
 def _load_heartrate_file(filepath):
     # print(f"Loading {filepath}...")
     # json format is {"dateTime": "2020-01-01", "value": {"bpm": 60, "confidence": 0}}
diff --git a/src/quantifiedme/load/location.py b/src/quantifiedme/load/location.py
index 76e14bf..2d6a568 100644
--- a/src/quantifiedme/load/location.py
+++ b/src/quantifiedme/load/location.py
@@ -15,6 +15,42 @@
 memory = joblib.Memory(".cache")
 
 
+@memory.cache
+def load_all_dfs() -> dict[str, pd.DataFrame]:
+    dfs = {}
+    path = str(Path(load_config()["data"]["location"]).expanduser())
+    for filepath in glob.glob(path + "/*.json"):
+        name = Path(filepath).name.replace(".json", "")
+        df = location_history_to_df(filepath)
+        dfs[name] = df
+    return dfs
+
+
+def load_daily_df(whitelist: list[str] | None = None) -> pd.DataFrame:
+    """Returns a daily dataframe with how many hours were spent at each location or with each person."""
+    config = load_config()
+    me = config["me"]["name"]
+    locations = config["locations"]
+
+    df = pd.DataFrame(index=pd.DatetimeIndex([]))
+    dfs = load_all_dfs()
+
+    for location in (whitelist or [*locations.keys(), *dfs.keys()]):
+        if location == me:
+            continue
+        if location in locations:
+            loc = locations[location]
+            df[location] = _proximity_to_location(
+                dfs[me], (loc["lat"], loc["long"]), threshold_radius=loc["accuracy"]
+            )
+        elif location in dfs:
+            df[location] = colocate(dfs[me], dfs[location])
+        else:
+            raise ValueError(f"Unknown location {location}")
+
+    return df
+
+
 def location_history_to_df(fn, use_inferred_loc=False) -> pd.DataFrame:
     print(f"Loading location data from {fn}")
     with open(fn) as f:
@@ -70,18 +106,7 @@ def location_history_to_df(fn, use_inferred_loc=False) -> pd.DataFrame:
     return df
 
 
-@memory.cache
-def load_all_dfs() -> dict[str, pd.DataFrame]:
-    dfs = {}
-    path = str(Path(load_config()["data"]["location"]).expanduser())
-    for filepath in glob.glob(path + "/*.json"):
-        name = Path(filepath).name.replace(".json", "")
-        df = location_history_to_df(filepath)
-        dfs[name] = df
-    return dfs
-
-
-def colocate(df_person1, df_person2, verbose=False):
+def colocate(df_person1, df_person2, verbose=False) -> pd.DataFrame:
     df = df_person1.join(df_person2, lsuffix="_a", rsuffix="_b")
     df["dist"] = (
         (df["lat_a"] - df["lat_b"]) ** 2 + (df["long_a"] - df["long_b"]) ** 2
@@ -131,48 +156,24 @@ def plot_df_duration(df, title, save: str | None = None) -> None:
         plt.show()
 
 
-def main_plot(dfs, me, other, save=None, invert=False):
-    coords = load_config()["locations"]
-
-    df = dfs[me]
-
-    if other in coords:
-        loc = coords[other]
-        df = _proximity_to_location(
-            df, (loc["lat"], loc["long"]), threshold_radius=loc["accuracy"]
-        )
-    else:
-        # df = colocate(dfs[me], dfs[args.other], start=args.start)
-        df_other = dfs[other]
-        df = colocate(df, df_other)
-
-    if invert:
-        df = 24 - df
-
-    # print(df)
-    plot_df_duration(df, other, save)
-
-
 @click.command()
 @click.argument("name")
 @click.option("--start", default=None, type=click.DateTime(), help="query from date")
 @click.option("--save", is_flag=True)
-@click.option("--me", default=None)
 @click.option("--invert", is_flag=True)
-def locate(
-    name: str, start: datetime, save: bool, me: str | None, invert: bool
-) -> None:
+def locate(name: str, start: datetime, save: bool, invert: bool) -> None:
     """Plot of when your location was proximate to some location NAME"""
-    if me is None:
-        me = load_config()["me"]["name"]
-
-    dfs = load_all_dfs()
-    df = dfs[me]
-
+    df = load_daily_df()
     if start:
         df = df[start < df.index]
 
-    main_plot(dfs, me, name, invert=invert)
+    if invert:
+        df = 24 - df
+
+    # print(df)
+    plot_df_duration(df, name, "location.png" if save else None)
 
 
 if __name__ == "__main__":
diff --git a/src/quantifiedme/load/oura.py b/src/quantifiedme/load/oura.py
index c58d1f7..bc7bf06 100644
--- a/src/quantifiedme/load/oura.py
+++ b/src/quantifiedme/load/oura.py
@@ -22,8 +22,18 @@ def load_sleep_df() -> pd.DataFrame:
     data = load_data()
     df = pd.DataFrame(data["sleep"])
     df["summary_date"] = pd.to_datetime(df["summary_date"])
-    df = df.set_index("summary_date")
-    return df
+    df = df.rename(columns={"summary_date": "timestamp"})
+    df = df.set_index("timestamp")
+    df["bedtime_start"] = pd.to_datetime(df["bedtime_start"])
+    df["bedtime_end"] = pd.to_datetime(df["bedtime_end"])
+    df = df.rename(
+        columns={
+            "bedtime_start": "start",
+            "bedtime_end": "end",
+        }
+    )
+    df["duration"] = df["end"] - df["start"]
+    return df[["start", "end", "duration", "score"]]
 
 
 def load_readiness_df() -> pd.DataFrame:
diff --git a/src/quantifiedme/load/qslang.py b/src/quantifiedme/load/qslang.py
index d687aa1..13890b7 100644
--- a/src/quantifiedme/load/qslang.py
+++ b/src/quantifiedme/load/qslang.py
@@ -139,19 +139,27 @@ def to_series(
     return series
 
 
-def to_df_daily(events: list[Event]):
+def load_daily_df(events: list[Event] | None = None) -> pd.DataFrame:
     """Returns a daily dataframe"""
+    if events is None:
+        events = load_events()
     df_src = load_df(events)
     df = pd.DataFrame()
+
     tags = {tag for e in events for tag in e.data.get("tags", [])}
-    print(tags)
-    for tag in tags:
-        df[f"tag:{tag}"] = to_series(df_src, tag=tag)
+    series_tags = {
+        f"tag:{tag}": to_series(df_src, tag=tag).replace(np.nan, 0)
+        for tag in tags
+    }
 
     substances = {s for s in df_src["substance"] if s}
-    for subst in substances:
-        colname = subst.lower().replace("-", "").replace(" ", "")
-        df[colname] = to_series(df_src, substance=subst)
+    series_subst = {
+        subst.lower().replace("-", "").replace(" ", ""): to_series(df_src, substance=subst)
+        for subst in substances
+    }
+    df = pd.concat([df, pd.DataFrame(series_tags), pd.DataFrame(series_subst)], axis=1)
+
+    return df
 
 
 def _missing_dates():
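The rewritten `load_daily_df` builds dicts of series and concatenates once instead of inserting columns one at a time. A toy example of the pattern, with synthetic series and names:

import numpy as np
import pandas as pd

idx = pd.to_datetime(["2023-05-01", "2023-05-02"])
series_tags = {"tag:stimulant": pd.Series([1, 0], index=idx)}
series_subst = {"caffeine": pd.Series([100e-3, np.nan], index=idx)}

# build all columns first, concatenate once
df = pd.concat([pd.DataFrame(series_tags), pd.DataFrame(series_subst)], axis=1)
print(df)

Concatenating once avoids the PerformanceWarning pandas emits about highly fragmented frames when many columns are inserted individually.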
diff --git a/src/quantifiedme/load/whoop.py b/src/quantifiedme/load/whoop.py
index a10a5a0..9f15b8d 100644
--- a/src/quantifiedme/load/whoop.py
+++ b/src/quantifiedme/load/whoop.py
@@ -5,6 +5,7 @@
 """
 
 from pathlib import Path
+from datetime import timedelta
 
 import pandas as pd
 
@@ -37,10 +38,48 @@ def load_heartrate_df() -> pd.DataFrame:
     return df
 
 
-def test_load_whoop():
+def load_sleep_df() -> pd.DataFrame:
+    whoop_export_dir = load_config()["data"]["whoop"]
+    filename = Path(whoop_export_dir) / "Health" / "sleeps.csv"
+    df = pd.read_csv(filename.expanduser(), parse_dates=True)
+    import json
+
+    # df columns are: "created_at","updated_at","activity_id","score","quality_duration","latency","max_heart_rate","average_heart_rate","debt_pre","debt_post","need_from_strain","sleep_need","habitual_sleep_need","disturbances","time_in_bed","light_sleep_duration","slow_wave_sleep_duration","rem_sleep_duration","cycles_count","wake_duration","arousal_time","no_data_duration","in_sleep_efficiency","credit_from_naps","hr_baseline","respiratory_rate","sleep_consistency","algo_version","projected_score","projected_sleep","optimal_sleep_times","kilojoules","user_id","during","timezone_offset","survey_response_id","percent_recorded","auto_detected","state","responded","team_act_id","source","is_significant","is_normal","is_nap"
+    # we are interested in the "during" column, which is a JSON string of a 2-tuple with isoformat timestamps
+    def parse_during(x):
+        try:
+            # "during" is a half-open interval; replacing ")" with "]" yields valid JSON
+            return json.loads(x.replace(")", "]"))
+        except Exception:
+            print(x)
+            raise
+
+    df["start"] = pd.to_datetime(df["during"].apply(lambda x: parse_during(x)[0]))
+    df["end"] = pd.to_datetime(df["during"].apply(lambda x: parse_during(x)[1]))
+    df["duration"] = df["end"] - df["start"]
+
+    # keep only the columns we want
+    df = df[["start", "end", "duration", "score"]]
+
+    # set index and sort; shift start times back so sleeps beginning after
+    # midnight are attributed to the previous day
+    offset = timedelta(hours=8)
+    df = df.set_index(pd.DatetimeIndex(df["start"] - offset).date)  # type: ignore
+    df = df.sort_index()
+
+    # rename index to timestamp
+    df.index.name = "timestamp"
+
+    return df
+
+
+def test_load_whoop_heartrate():
     df = load_heartrate_df()
     print(df.head())
 
 
+def test_load_whoop_sleep():
+    df = load_sleep_df()
+    print(df.head())
+
+
 if __name__ == "__main__":
-    test_load_whoop()
+    test_load_whoop_sleep()
+    test_load_whoop_heartrate()
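The `during` parsing above relies on the column being a JSON-style half-open interval, as the in-code comment documents. A minimal sketch with a made-up sample value:

import json
import pandas as pd

during = '["2023-05-01T23:10:00+00:00","2023-05-02T07:05:00+00:00")'
start_s, end_s = json.loads(during.replace(")", "]"))
start, end = pd.to_datetime(start_s), pd.to_datetime(end_s)
print(end - start)  # 0 days 07:55:00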
diff --git a/tests/test_load.py b/tests/test_load.py
index a619219..45dd492 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -22,13 +22,17 @@
 from quantifiedme.derived.all_df import load_all_df
 from quantifiedme.derived.screentime import classify
 
+from qslang import Event as QSEvent
+
 now = datetime.now(tz=timezone.utc)
 
+
 @pytest.fixture(scope="session", autouse=True)
 def setup():
-    pd.set_option('display.max_colwidth', None)
-    pd.set_option('display.max_columns', None)
-    #pd.set_option('display.max_rows', None)
+    pd.set_option("display.max_colwidth", None)
+    pd.set_option("display.max_columns", None)
+    # pd.set_option('display.max_rows', None)
+
 
 def load_example_events() -> list[Event]:
     events_cached_fast = Path("notebooks/events_fast.pickle")
@@ -51,11 +55,10 @@ def test_load_example_events():
 
 def test_load_all_df():
     events = load_example_events()
-    df = load_all_df(events, ignore=["heartrate"])
+    df = load_all_df(screentime_events=events, ignore=["heartrate", "location", "sleep"])
     print(df)
 
-
 @pytest.mark.skipif(not has_config(), reason="no config available for test data")
 def test_load_qslang():
     df = load_df()
@@ -76,7 +79,9 @@ def test_load_qslang():
     assert (10e-6 <= series_nonzero).all()
 
     # Less than 500mg
-    assert (series_nonzero <= 500e-6).all(), series_nonzero[series_nonzero >= 500e-6]
+    assert (series_nonzero <= 500e-6).all(), series_nonzero[
+        series_nonzero >= 500e-6
+    ]
 
     for subst in ["Phenibut"]:
         series = to_series(df, substance=subst)
@@ -95,11 +100,20 @@ def test_load_qslang():
 
 
 def test_qslang_unknown_dose():
-    from qslang import Event as QSEvent
     events = [
-        QSEvent(timestamp=now, type="dose", data={"substance": "Caffeine", "amount": "?g"}),
-        QSEvent(timestamp=now, type="dose", data={"substance": "Caffeine", "amount": "100mg"}),
-        QSEvent(timestamp=now, type="dose", data={"substance": "Caffeine", "amount": "200mg"}),
+        QSEvent(
+            timestamp=now, type="dose", data={"substance": "Caffeine", "amount": "?g"}
+        ),
+        QSEvent(
+            timestamp=now,
+            type="dose",
+            data={"substance": "Caffeine", "amount": "100mg"},
+        ),
+        QSEvent(
+            timestamp=now,
+            type="dose",
+            data={"substance": "Caffeine", "amount": "200mg"},
+        ),
     ]
     df = load_df(events)
     assert 0.00015 == df.iloc[0]["dose"]