From 13401df86ebde07083bd2998a26fba509f1b2ac8 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Jun 2024 13:05:55 +0200 Subject: [PATCH 1/3] update 23 files --- README.md | 47 +- docs/tutorials/01_basic.ipynb | 2384 ++++++++--------- docs/tutorials/02_advanced.ipynb | 8 +- docs/tutorials/03_text.ipynb | 4 +- example.py | 8 +- pyproject.toml | 2 +- src/test_benchmark.py | 6 +- src/timeseriesflattener/aggregators_test.py | 14 +- src/timeseriesflattener/intermediary.py | 35 +- src/timeseriesflattener/main.py | 95 +- src/timeseriesflattener/main_test.py | 46 +- .../processors/static_test.py | 4 +- .../processors/temporal.py | 16 +- .../processors/temporal_test.py | 48 +- src/timeseriesflattener/specs/outcome.py | 4 +- .../specs/prediction_times.py | 21 +- src/timeseriesflattener/specs/static.py | 10 +- src/timeseriesflattener/specs/temporal.py | 2 +- src/timeseriesflattener/specs/test_specs.py | 4 +- src/timeseriesflattener/specs/timedelta.py | 7 +- src/timeseriesflattener/specs/timestamp.py | 10 +- src/timeseriesflattener/specs/value.py | 19 +- src/timeseriesflattener/utils.py | 12 +- 23 files changed, 1376 insertions(+), 1430 deletions(-) diff --git a/README.md b/README.md index 271d4d82..19d445a6 100644 --- a/README.md +++ b/README.md @@ -10,13 +10,14 @@ [![PyPI version](https://badge.fury.io/py/timeseriesflattener.svg)](https://pypi.org/project/timeseriesflattener/) [![status](https://joss.theoj.org/papers/3bbea8745668d1aa40ff796c6fd3db87/status.svg)](https://joss.theoj.org/papers/3bbea8745668d1aa40ff796c6fd3db87) -Time series from e.g. electronic health records often have a large number of variables, are sampled at irregular intervals and tend to have a large number of missing values. Before this type of data can be used for prediction modelling with machine learning methods such as logistic regression or XGBoost, the data needs to be reshaped. +Time series from e.g. 
electronic health records often have a large number of variables, are sampled at irregular intervals and tend to have a large number of missing values. Before this type of data can be used for prediction modelling with machine learning methods such as logistic regression or XGBoost, the data needs to be reshaped. -In essence, the time series need to be *flattened* so that each prediction time is represented by a set of predictor values and an outcome value. These predictor values can be constructed by aggregating the preceding values in the time series within a certain time window. +In essence, the time series need to be _flattened_ so that each prediction time is represented by a set of predictor values and an outcome value. These predictor values can be constructed by aggregating the preceding values in the time series within a certain time window. -`timeseriesflattener` aims to simplify this process by providing an easy-to-use and fully-specified pipeline for flattening complex time series. +`timeseriesflattener` aims to simplify this process by providing an easy-to-use and fully-specified pipeline for flattening complex time series. 
## ๐Ÿ”ง Installation + To get started using timeseriesflattener simply install it using pip by running the following line in your terminal: ``` @@ -58,7 +59,7 @@ from timeseriesflattener import ( predictor_spec = PredictorSpec( value_frame=ValueFrame( - init_df=predictor_df.lazy(), entity_id_col_name="id", value_timestamp_col_name="date" + init_df=predictor_df, entity_id_col_name="id", value_timestamp_col_name="date" ), lookbehind_distances=[dt.timedelta(days=1)], aggregators=[MaxAggregator(), MinAggregator()], @@ -68,7 +69,7 @@ predictor_spec = PredictorSpec( outcome_spec = OutcomeSpec( value_frame=ValueFrame( - init_df=outcome_df.lazy(), entity_id_col_name="id", value_timestamp_col_name="date" + init_df=outcome_df, entity_id_col_name="id", value_timestamp_col_name="date" ), lookahead_distances=[dt.timedelta(days=1)], aggregators=[MaxAggregator(), MinAggregator()], @@ -81,29 +82,29 @@ from timeseriesflattener import Flattener result = Flattener( predictiontime_frame=PredictionTimeFrame( - init_df=prediction_times_df.lazy(), entity_id_col_name="id", timestamp_col_name="date" + init_df=prediction_times_df, entity_id_col_name="id", timestamp_col_name="date" ) ).aggregate_timeseries(specs=[predictor_spec, outcome_spec]) -result.collect() +result ``` -Output: -| | id | date | prediction_time_uuid | pred_test_feature_within_30_days_mean_fallback_nan | outc_test_outcome_within_31_days_maximum_fallback_0_dichotomous | -| ---: | ---: | :------------------ | :-------------------- | -------------------------------------------------: | --------------------------------------------------------------: | -| 0 | 1 | 2020-01-01 00:00:00 | 1-2020-01-01-00-00-00 | 2.5 | 0 | -| 1 | 1 | 2020-02-01 00:00:00 | 1-2020-02-01-00-00-00 | 1 | 1 | -| 2 | 2 | 2020-02-01 00:00:00 | 2-2020-02-01-00-00-00 | 4 | 0 | +Output: +| | id | date | prediction_time_uuid | pred_test_feature_within_30_days_mean_fallback_nan | outc_test_outcome_within_31_days_maximum_fallback_0_dichotomous | +| --: | --: | 
:------------------ | :-------------------- | -------------------------------------------------: | --------------------------------------------------------------: | +| 0 | 1 | 2020-01-01 00:00:00 | 1-2020-01-01-00-00-00 | 2.5 | 0 | +| 1 | 1 | 2020-02-01 00:00:00 | 1-2020-02-01-00-00-00 | 1 | 1 | +| 2 | 2 | 2020-02-01 00:00:00 | 2-2020-02-01-00-00-00 | 4 | 0 | ## ๐Ÿ“– Documentation -| Documentation | | -| ---------------------- | -------------------------------------------------------------------------------------- | -| ๐ŸŽ“ **[Tutorial]** | Simple and advanced tutorials to get you started using `timeseriesflattener` | -| ๐ŸŽ› **[General docs]** | The detailed reference for timeseriesflattener's API. | -| ๐Ÿ™‹ **[FAQ]** | Frequently asked question | -| ๐Ÿ—บ๏ธ **[Roadmap]** | Kanban board for the roadmap for the project | +| Documentation | | +| -------------------- | ---------------------------------------------------------------------------- | +| ๐ŸŽ“ **[Tutorial]** | Simple and advanced tutorials to get you started using `timeseriesflattener` | +| ๐ŸŽ› **[General docs]** | The detailed reference for timeseriesflattener's API. 
| +| ๐Ÿ™‹ **[FAQ]** | Frequently asked question | +| ๐Ÿ—บ๏ธ **[Roadmap]** | Kanban board for the roadmap for the project | [Tutorial]: https://aarhus-psychiatry-research.github.io/timeseriesflattener/tutorials.html [General docs]: https://Aarhus-Psychiatry-Research.github.io/timeseriesflattener/ @@ -112,16 +113,16 @@ Output: ## ๐Ÿ’ฌ Where to ask questions -| Type | | -| ------------------------------ | ---------------------- | +| Type | | +| ------------------------------- | ---------------------- | | ๐Ÿšจ **Bug Reports** | [GitHub Issue Tracker] | | ๐ŸŽ **Feature Requests & Ideas** | [GitHub Issue Tracker] | | ๐Ÿ‘ฉโ€๐Ÿ’ป **Usage Questions** | [GitHub Discussions] | -| ๐Ÿ—ฏ **General Discussion** | [GitHub Discussions] | +| ๐Ÿ—ฏ **General Discussion** | [GitHub Discussions] | [github issue tracker]: https://github.com/Aarhus-Psychiatry-Research/timeseriesflattener/issues [github discussions]: https://github.com/Aarhus-Psychiatry-Research/timeseriesflattener/discussions - ## ๐ŸŽ“ Projects + PSYCOP projects use `timeseriesflattener`, see more at the [monorepo](https://github.com/Aarhus-Psychiatry-Research/psycop-common/tree/main/psycop/projects). diff --git a/docs/tutorials/01_basic.ipynb b/docs/tutorials/01_basic.ipynb index ff7d4c66..a83e1571 100644 --- a/docs/tutorials/01_basic.ipynb +++ b/docs/tutorials/01_basic.ipynb @@ -1,1194 +1,1194 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Introductory Tutorial\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "TimeseriesFlattener flattens timeseries. This is especially helpful if you have complicated and irregular time series but want to train simple models.\n", - "\n", - "We explain terminology as needed in this tutorial. If you need a reference, see the [docs](https://aarhus-psychiatry-research.github.io/timeseriesflattener/#functionality).\n", - "\n", - "Applying it consists of 3 steps:\n", - "\n", - "1. 
Loading data (prediction times, predictor(s), and outcome(s))\n", - "2. Specifying how to flatten the data and\n", - "3. Flattening\n", - "\n", - "The simplest case is adding one predictor and one outcome.\n", - "\n", - "First, we'll load the timestamps for every time we want to issue a prediction:\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading data\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Loading prediction times\n", - "\n", - "Predictin times consist of two elements:\n", - "\n", - "1. The entity id. This is the entity about which the prediction is issued. In medical contexts, this is frequently a patient.\n", - "2. The timestamp at which the prediction is to be issued.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n",
-       "โ”‚          Data Summary                Data Types                                                                 โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“                                                          โ”‚\n",
-       "โ”‚ โ”ƒ dataframe         โ”ƒ Values โ”ƒ โ”ƒ Column Type โ”ƒ Count โ”ƒ                                                          โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ                                                          โ”‚\n",
-       "โ”‚ โ”‚ Number of rows    โ”‚ 10000  โ”‚ โ”‚ int64       โ”‚ 1     โ”‚                                                          โ”‚\n",
-       "โ”‚ โ”‚ Number of columns โ”‚ 2      โ”‚ โ”‚ datetime64  โ”‚ 1     โ”‚                                                          โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜                                                          โ”‚\n",
-       "โ”‚                                                     number                                                      โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
-       "โ”‚ โ”ƒ column_name      โ”ƒ NA   โ”ƒ NA %    โ”ƒ mean   โ”ƒ sd     โ”ƒ p0   โ”ƒ p25    โ”ƒ p50    โ”ƒ p75    โ”ƒ p100    โ”ƒ hist     โ”ƒ  โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
-       "โ”‚ โ”‚ entity_id        โ”‚    0 โ”‚       0 โ”‚   5000 โ”‚   2900 โ”‚    0 โ”‚   2500 โ”‚   4900 โ”‚   7400 โ”‚   10000 โ”‚  โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡  โ”‚  โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
-       "โ”‚                                                    datetime                                                     โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
-       "โ”‚ โ”ƒ column_name      โ”ƒ NA   โ”ƒ NA %    โ”ƒ first                      โ”ƒ last                       โ”ƒ frequency    โ”ƒ  โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
-       "โ”‚ โ”‚ timestamp        โ”‚    0 โ”‚       0 โ”‚    1965-01-02 09:35:00     โ”‚    1969-12-31 21:42:00     โ”‚ None         โ”‚  โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
-       "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n",
-       "
\n" - ], - "text/plain": [ - "โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n", - "โ”‚ \u001b[3m Data Summary \u001b[0m \u001b[3m Data Types \u001b[0m โ”‚\n", - "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mdataframe \u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mValues\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mColumn Type\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mCount\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ Number of rows โ”‚ 10000 โ”‚ โ”‚ int64 โ”‚ 1 โ”‚ โ”‚\n", - "โ”‚ โ”‚ Number of columns โ”‚ 2 โ”‚ โ”‚ datetime64 โ”‚ 1 โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ”‚ \u001b[3m number \u001b[0m โ”‚\n", - "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % 
\u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp50 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mentity_id \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 5000\u001b[0m โ”‚ \u001b[36m 2900\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 2500\u001b[0m โ”‚ \u001b[36m 4900\u001b[0m โ”‚ \u001b[36m 7400\u001b[0m โ”‚ \u001b[36m 10000\u001b[0m โ”‚ \u001b[32m โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡ \u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ”‚ \u001b[3m datetime \u001b[0m โ”‚\n", - "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ 
โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfirst \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mlast \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfrequency \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mtimestamp \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[31m 1965-01-02 09:35:00 \u001b[0m โ”‚ \u001b[31m 1969-12-31 21:42:00 \u001b[0m โ”‚ \u001b[38;5;141mNone \u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n" - ] - }, - "metadata": {}, - "output_type": "display_data" + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introductory Tutorial\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TimeseriesFlattener flattens timeseries. 
This is especially helpful if you have complicated and irregular time series but want to train simple models.\n", + "\n", + "We explain terminology as needed in this tutorial. If you need a reference, see the [docs](https://aarhus-psychiatry-research.github.io/timeseriesflattener/#functionality).\n", + "\n", + "Applying it consists of 3 steps:\n", + "\n", + "1. Loading data (prediction times, predictor(s), and outcome(s))\n", + "2. Specifying how to flatten the data and\n", + "3. Flattening\n", + "\n", + "The simplest case is adding one predictor and one outcome.\n", + "\n", + "First, we'll load the timestamps for every time we want to issue a prediction:\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading data\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading prediction times\n", + "\n", + "Prediction times consist of two elements:\n", + "\n", + "1. The entity id. This is the entity about which the prediction is issued. In medical contexts, this is frequently a patient.\n", + "2. The timestamp at which the prediction is to be issued.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n",
+                            "โ”‚          Data Summary                Data Types                                                                 โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“                                                          โ”‚\n",
+                            "โ”‚ โ”ƒ dataframe         โ”ƒ Values โ”ƒ โ”ƒ Column Type โ”ƒ Count โ”ƒ                                                          โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ                                                          โ”‚\n",
+                            "โ”‚ โ”‚ Number of rows    โ”‚ 10000  โ”‚ โ”‚ int64       โ”‚ 1     โ”‚                                                          โ”‚\n",
+                            "โ”‚ โ”‚ Number of columns โ”‚ 2      โ”‚ โ”‚ datetime64  โ”‚ 1     โ”‚                                                          โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜                                                          โ”‚\n",
+                            "โ”‚                                                     number                                                      โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
+                            "โ”‚ โ”ƒ column_name      โ”ƒ NA   โ”ƒ NA %    โ”ƒ mean   โ”ƒ sd     โ”ƒ p0   โ”ƒ p25    โ”ƒ p50    โ”ƒ p75    โ”ƒ p100    โ”ƒ hist     โ”ƒ  โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
+                            "โ”‚ โ”‚ entity_id        โ”‚    0 โ”‚       0 โ”‚   5000 โ”‚   2900 โ”‚    0 โ”‚   2500 โ”‚   4900 โ”‚   7400 โ”‚   10000 โ”‚  โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡  โ”‚  โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
+                            "โ”‚                                                    datetime                                                     โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
+                            "โ”‚ โ”ƒ column_name      โ”ƒ NA   โ”ƒ NA %    โ”ƒ first                      โ”ƒ last                       โ”ƒ frequency    โ”ƒ  โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
+                            "โ”‚ โ”‚ timestamp        โ”‚    0 โ”‚       0 โ”‚    1965-01-02 09:35:00     โ”‚    1969-12-31 21:42:00     โ”‚ None         โ”‚  โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
+                            "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n",
+                            "
\n" + ], + "text/plain": [ + "โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n", + "โ”‚ \u001b[3m Data Summary \u001b[0m \u001b[3m Data Types \u001b[0m โ”‚\n", + "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mdataframe \u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mValues\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mColumn Type\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mCount\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ Number of rows โ”‚ 10000 โ”‚ โ”‚ int64 โ”‚ 1 โ”‚ โ”‚\n", + "โ”‚ โ”‚ Number of columns โ”‚ 2 โ”‚ โ”‚ datetime64 โ”‚ 1 โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ”‚ \u001b[3m number \u001b[0m โ”‚\n", + "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % 
\u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp50 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mentity_id \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 5000\u001b[0m โ”‚ \u001b[36m 2900\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 2500\u001b[0m โ”‚ \u001b[36m 4900\u001b[0m โ”‚ \u001b[36m 7400\u001b[0m โ”‚ \u001b[36m 10000\u001b[0m โ”‚ \u001b[32m โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡ \u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ”‚ \u001b[3m datetime \u001b[0m โ”‚\n", + "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ 
โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfirst \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mlast \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfrequency \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mtimestamp \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[31m 1965-01-02 09:35:00 \u001b[0m โ”‚ \u001b[31m 1969-12-31 21:42:00 \u001b[0m โ”‚ \u001b[38;5;141mNone \u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (10_000, 2)
entity_idtimestamp
i64datetime[ฮผs]
01969-01-11 09:55:00
11965-03-15 07:16:00
21969-09-13 23:18:00
31968-02-04 16:16:00
41965-01-28 12:33:00
51967-10-09 06:22:00
71969-11-17 02:50:00
81965-12-11 02:17:00
91965-08-21 22:00:00
101965-12-26 16:45:00
111966-07-09 13:54:00
121968-01-26 22:35:00
99901965-11-16 21:02:00
99901967-06-09 02:48:00
99911966-07-05 03:18:00
99941965-04-20 06:19:00
99941967-05-07 04:44:00
99951968-08-23 11:30:00
99951965-11-24 05:42:00
99961965-01-30 17:19:00
99961965-07-18 17:12:00
99971967-06-08 07:52:00
99991965-07-19 14:59:00
99991968-02-07 22:24:00
" + ], + "text/plain": [ + "shape: (10_000, 2)\n", + "โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”\n", + "โ”‚ entity_id โ”† timestamp โ”‚\n", + "โ”‚ --- โ”† --- โ”‚\n", + "โ”‚ i64 โ”† datetime[ฮผs] โ”‚\n", + "โ•žโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ก\n", + "โ”‚ 0 โ”† 1969-01-11 09:55:00 โ”‚\n", + "โ”‚ 1 โ”† 1965-03-15 07:16:00 โ”‚\n", + "โ”‚ 2 โ”† 1969-09-13 23:18:00 โ”‚\n", + "โ”‚ 3 โ”† 1968-02-04 16:16:00 โ”‚\n", + "โ”‚ 4 โ”† 1965-01-28 12:33:00 โ”‚\n", + "โ”‚ โ€ฆ โ”† โ€ฆ โ”‚\n", + "โ”‚ 9996 โ”† 1965-01-30 17:19:00 โ”‚\n", + "โ”‚ 9996 โ”† 1965-07-18 17:12:00 โ”‚\n", + "โ”‚ 9997 โ”† 1967-06-08 07:52:00 โ”‚\n", + "โ”‚ 9999 โ”† 1965-07-19 14:59:00 โ”‚\n", + "โ”‚ 9999 โ”† 1968-02-07 22:24:00 โ”‚\n", + "โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from __future__ import annotations\n", + "\n", + "from skimpy import skim\n", + "from timeseriesflattener.testing.load_synth_data import load_synth_prediction_times\n", + "\n", + "df_prediction_times = load_synth_prediction_times()\n", + "\n", + "skim(df_prediction_times)\n", + "df_prediction_times.sort([\"entity_id\"])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, \"entity_id\" represents a patient ID and โ€œtimestampโ€ refers to the time when we want to issue a prediction. Note that each ID can have multiple prediction times.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading a temporal predictor\n", + "\n", + "Then, we'll load the values for our temporal predictor. 
Temporal predictors are predictors that can have a different value at different timepoints.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n",
+                            "โ”‚          Data Summary                Data Types                                                                 โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“                                                          โ”‚\n",
+                            "โ”‚ โ”ƒ dataframe         โ”ƒ Values โ”ƒ โ”ƒ Column Type โ”ƒ Count โ”ƒ                                                          โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ                                                          โ”‚\n",
+                            "โ”‚ โ”‚ Number of rows    โ”‚ 100000 โ”‚ โ”‚ int64       โ”‚ 1     โ”‚                                                          โ”‚\n",
+                            "โ”‚ โ”‚ Number of columns โ”‚ 3      โ”‚ โ”‚ datetime64  โ”‚ 1     โ”‚                                                          โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ float64     โ”‚ 1     โ”‚                                                          โ”‚\n",
+                            "โ”‚                                โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜                                                          โ”‚\n",
+                            "โ”‚                                                     number                                                      โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
+                            "โ”‚ โ”ƒ column_name     โ”ƒ NA   โ”ƒ NA %   โ”ƒ mean   โ”ƒ sd     โ”ƒ p0        โ”ƒ p25    โ”ƒ p50    โ”ƒ p75   โ”ƒ p100   โ”ƒ hist    โ”ƒ  โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
+                            "โ”‚ โ”‚ entity_id       โ”‚    0 โ”‚      0 โ”‚   5000 โ”‚   2900 โ”‚         0 โ”‚   2500 โ”‚   5000 โ”‚  7500 โ”‚  10000 โ”‚ โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡  โ”‚  โ”‚\n",
+                            "โ”‚ โ”‚ value           โ”‚    0 โ”‚      0 โ”‚      5 โ”‚    2.9 โ”‚   0.00015 โ”‚    2.5 โ”‚      5 โ”‚   7.5 โ”‚     10 โ”‚ โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡  โ”‚  โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
+                            "โ”‚                                                    datetime                                                     โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
+                            "โ”‚ โ”ƒ column_name      โ”ƒ NA   โ”ƒ NA %    โ”ƒ first                      โ”ƒ last                       โ”ƒ frequency    โ”ƒ  โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
+                            "โ”‚ โ”‚ timestamp        โ”‚    0 โ”‚       0 โ”‚    1965-01-02 00:01:00     โ”‚    1969-12-31 23:37:00     โ”‚ None         โ”‚  โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
+                            "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n",
+                            "
\n" + ], + "text/plain": [ + "โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n", + "โ”‚ \u001b[3m Data Summary \u001b[0m \u001b[3m Data Types \u001b[0m โ”‚\n", + "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mdataframe \u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mValues\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mColumn Type\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mCount\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ Number of rows โ”‚ 100000 โ”‚ โ”‚ int64 โ”‚ 1 โ”‚ โ”‚\n", + "โ”‚ โ”‚ Number of columns โ”‚ 3 โ”‚ โ”‚ datetime64 โ”‚ 1 โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ float64 โ”‚ 1 โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ”‚ \u001b[3m number \u001b[0m โ”‚\n", + "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m 
\u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp50 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mentity_id \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 5000\u001b[0m โ”‚ \u001b[36m 2900\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 2500\u001b[0m โ”‚ \u001b[36m 5000\u001b[0m โ”‚ \u001b[36m 7500\u001b[0m โ”‚ \u001b[36m 10000\u001b[0m โ”‚ \u001b[32mโ–‡โ–‡โ–‡โ–‡โ–‡โ–‡ \u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mvalue \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 5\u001b[0m โ”‚ \u001b[36m 2.9\u001b[0m โ”‚ \u001b[36m 0.00015\u001b[0m โ”‚ \u001b[36m 2.5\u001b[0m โ”‚ \u001b[36m 5\u001b[0m โ”‚ \u001b[36m 7.5\u001b[0m โ”‚ \u001b[36m 10\u001b[0m โ”‚ \u001b[32mโ–‡โ–‡โ–‡โ–‡โ–‡โ–‡ \u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ”‚ \u001b[3m 
datetime \u001b[0m โ”‚\n", + "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfirst \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mlast \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfrequency \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mtimestamp \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[31m 1965-01-02 00:01:00 \u001b[0m โ”‚ \u001b[31m 1969-12-31 23:37:00 \u001b[0m โ”‚ \u001b[38;5;141mNone \u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End 
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (100_000, 3)
entity_idtimestampvalue
i64datetime[ฮผs]f64
01967-06-12 14:06:000.174793
01968-04-15 01:45:003.072293
01968-12-09 05:42:001.315754
01969-06-20 18:07:002.812481
01967-11-26 01:59:002.981185
01968-09-07 01:45:000.173205
01969-02-21 03:29:009.943505
01967-11-26 13:45:005.470792
01967-05-12 12:44:000.970382
01965-05-03 05:23:006.630007
01969-10-19 10:29:000.799246
01969-11-25 18:39:002.868688
99991967-12-11 21:34:005.379503
99991968-05-26 12:28:009.645746
99991966-04-23 02:37:008.924175
99991968-10-24 18:47:002.34615
99991967-11-12 07:33:004.414524
99991966-07-15 20:55:000.684468
99991968-01-08 02:41:003.250538
99991968-08-19 10:15:000.671907
99991966-01-03 22:34:004.158796
99991966-06-27 10:55:004.414455
99991968-04-02 12:58:001.552491
99991969-06-24 07:19:004.501553
" + ], + "text/plain": [ + "shape: (100_000, 3)\n", + "โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”\n", + "โ”‚ entity_id โ”† timestamp โ”† value โ”‚\n", + "โ”‚ --- โ”† --- โ”† --- โ”‚\n", + "โ”‚ i64 โ”† datetime[ฮผs] โ”† f64 โ”‚\n", + "โ•žโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ก\n", + "โ”‚ 0 โ”† 1967-06-12 14:06:00 โ”† 0.174793 โ”‚\n", + "โ”‚ 0 โ”† 1968-04-15 01:45:00 โ”† 3.072293 โ”‚\n", + "โ”‚ 0 โ”† 1968-12-09 05:42:00 โ”† 1.315754 โ”‚\n", + "โ”‚ 0 โ”† 1969-06-20 18:07:00 โ”† 2.812481 โ”‚\n", + "โ”‚ 0 โ”† 1967-11-26 01:59:00 โ”† 2.981185 โ”‚\n", + "โ”‚ โ€ฆ โ”† โ€ฆ โ”† โ€ฆ โ”‚\n", + "โ”‚ 9999 โ”† 1968-08-19 10:15:00 โ”† 0.671907 โ”‚\n", + "โ”‚ 9999 โ”† 1966-01-03 22:34:00 โ”† 4.158796 โ”‚\n", + "โ”‚ 9999 โ”† 1966-06-27 10:55:00 โ”† 4.414455 โ”‚\n", + "โ”‚ 9999 โ”† 1968-04-02 12:58:00 โ”† 1.552491 โ”‚\n", + "โ”‚ 9999 โ”† 1969-06-24 07:19:00 โ”† 4.501553 โ”‚\n", + "โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from timeseriesflattener.testing.load_synth_data import load_synth_predictor_float\n", + "\n", + "df_synth_predictors = load_synth_predictor_float()\n", + "\n", + "skim(df_synth_predictors)\n", + "df_synth_predictors.sort([\"entity_id\"])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once again, note that there can be multiple values for each ID.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading a static predictor\n", + "\n", + "Frequently, you'll have one or more static predictors describing each entity. 
In this case, an entity is a patient, and an example of a static predictor could be their sex. It doesn't change over time (it's static), but can be used as a predictor for each prediction time. Let's load it in!\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n",
+                            "โ”‚          Data Summary                Data Types                                                                 โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“                                                          โ”‚\n",
+                            "โ”‚ โ”ƒ dataframe         โ”ƒ Values โ”ƒ โ”ƒ Column Type โ”ƒ Count โ”ƒ                                                          โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ                                                          โ”‚\n",
+                            "โ”‚ โ”‚ Number of rows    โ”‚ 9999   โ”‚ โ”‚ int64       โ”‚ 2     โ”‚                                                          โ”‚\n",
+                            "โ”‚ โ”‚ Number of columns โ”‚ 2      โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜                                                          โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜                                                                                  โ”‚\n",
+                            "โ”‚                                                     number                                                      โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
+                            "โ”‚ โ”ƒ column_name      โ”ƒ NA   โ”ƒ NA %    โ”ƒ mean   โ”ƒ sd     โ”ƒ p0   โ”ƒ p25    โ”ƒ p50    โ”ƒ p75    โ”ƒ p100    โ”ƒ hist     โ”ƒ  โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
+                            "โ”‚ โ”‚ entity_id        โ”‚    0 โ”‚       0 โ”‚   5000 โ”‚   2900 โ”‚    0 โ”‚   2500 โ”‚   5000 โ”‚   7500 โ”‚   10000 โ”‚  โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡  โ”‚  โ”‚\n",
+                            "โ”‚ โ”‚ female           โ”‚    0 โ”‚       0 โ”‚    0.5 โ”‚    0.5 โ”‚    0 โ”‚      0 โ”‚      0 โ”‚      1 โ”‚       1 โ”‚  โ–‡    โ–‡  โ”‚  โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
+                            "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n",
+                            "
\n" + ], + "text/plain": [ + "โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n", + "โ”‚ \u001b[3m Data Summary \u001b[0m \u001b[3m Data Types \u001b[0m โ”‚\n", + "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mdataframe \u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mValues\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mColumn Type\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mCount\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ Number of rows โ”‚ 9999 โ”‚ โ”‚ int64 โ”‚ 2 โ”‚ โ”‚\n", + "โ”‚ โ”‚ Number of columns โ”‚ 2 โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ”‚ \u001b[3m number \u001b[0m โ”‚\n", + "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m 
\u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp50 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mentity_id \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 5000\u001b[0m โ”‚ \u001b[36m 2900\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 2500\u001b[0m โ”‚ \u001b[36m 5000\u001b[0m โ”‚ \u001b[36m 7500\u001b[0m โ”‚ \u001b[36m 10000\u001b[0m โ”‚ \u001b[32m โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡ \u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mfemale \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0.5\u001b[0m โ”‚ \u001b[36m 0.5\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[32m โ–‡ โ–‡ \u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + 
"โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (9_999, 2)
entity_idfemale
i64i64
00
11
21
31
40
50
60
70
81
90
100
110
99881
99890
99901
99910
99920
99930
99941
99950
99960
99971
99981
99990
" + ], + "text/plain": [ + "shape: (9_999, 2)\n", + "โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”\n", + "โ”‚ entity_id โ”† female โ”‚\n", + "โ”‚ --- โ”† --- โ”‚\n", + "โ”‚ i64 โ”† i64 โ”‚\n", + "โ•žโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•ก\n", + "โ”‚ 0 โ”† 0 โ”‚\n", + "โ”‚ 1 โ”† 1 โ”‚\n", + "โ”‚ 2 โ”† 1 โ”‚\n", + "โ”‚ 3 โ”† 1 โ”‚\n", + "โ”‚ 4 โ”† 0 โ”‚\n", + "โ”‚ โ€ฆ โ”† โ€ฆ โ”‚\n", + "โ”‚ 9995 โ”† 0 โ”‚\n", + "โ”‚ 9996 โ”† 0 โ”‚\n", + "โ”‚ 9997 โ”† 1 โ”‚\n", + "โ”‚ 9998 โ”† 1 โ”‚\n", + "โ”‚ 9999 โ”† 0 โ”‚\n", + "โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from timeseriesflattener.testing.load_synth_data import load_synth_sex\n", + "\n", + "df_synth_sex = load_synth_sex()\n", + "\n", + "skim(df_synth_sex)\n", + "df_synth_sex" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As the predictor is static, there should only be a single value for each ID in this dataframe.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading a temporal outcome\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And, lastly, our outcome values. We've chosen a binary outcome and only stored values for the timestamps that experience the outcome. From these, we can infer patients that do not experience the outcome, since they do not have a timestamp. We handle this by setting a fallback of 0 - more on that in the following section.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n",
+                            "โ”‚          Data Summary                Data Types                                                                 โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“                                                          โ”‚\n",
+                            "โ”‚ โ”ƒ dataframe         โ”ƒ Values โ”ƒ โ”ƒ Column Type โ”ƒ Count โ”ƒ                                                          โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ                                                          โ”‚\n",
+                            "โ”‚ โ”‚ Number of rows    โ”‚ 3103   โ”‚ โ”‚ int64       โ”‚ 2     โ”‚                                                          โ”‚\n",
+                            "โ”‚ โ”‚ Number of columns โ”‚ 3      โ”‚ โ”‚ datetime64  โ”‚ 1     โ”‚                                                          โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜                                                          โ”‚\n",
+                            "โ”‚                                                     number                                                      โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
+                            "โ”‚ โ”ƒ column_name      โ”ƒ NA   โ”ƒ NA %    โ”ƒ mean   โ”ƒ sd     โ”ƒ p0   โ”ƒ p25    โ”ƒ p50    โ”ƒ p75    โ”ƒ p100    โ”ƒ hist     โ”ƒ  โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
+                            "โ”‚ โ”‚ entity_id        โ”‚    0 โ”‚       0 โ”‚   5000 โ”‚   2900 โ”‚    4 โ”‚   2500 โ”‚   5100 โ”‚   7600 โ”‚   10000 โ”‚  โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡  โ”‚  โ”‚\n",
+                            "โ”‚ โ”‚ value            โ”‚    0 โ”‚       0 โ”‚      1 โ”‚      0 โ”‚    1 โ”‚      1 โ”‚      1 โ”‚      1 โ”‚       1 โ”‚      โ–‡   โ”‚  โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
+                            "โ”‚                                                    datetime                                                     โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
+                            "โ”‚ โ”ƒ column_name      โ”ƒ NA   โ”ƒ NA %    โ”ƒ first                      โ”ƒ last                       โ”ƒ frequency    โ”ƒ  โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
+                            "โ”‚ โ”‚ timestamp        โ”‚    0 โ”‚       0 โ”‚    1965-01-04 07:50:00     โ”‚    1969-12-31 10:15:00     โ”‚ None         โ”‚  โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
+                            "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n",
+                            "
\n" + ], + "text/plain": [ + "โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n", + "โ”‚ \u001b[3m Data Summary \u001b[0m \u001b[3m Data Types \u001b[0m โ”‚\n", + "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mdataframe \u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mValues\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mColumn Type\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mCount\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ Number of rows โ”‚ 3103 โ”‚ โ”‚ int64 โ”‚ 2 โ”‚ โ”‚\n", + "โ”‚ โ”‚ Number of columns โ”‚ 3 โ”‚ โ”‚ datetime64 โ”‚ 1 โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ”‚ \u001b[3m number \u001b[0m โ”‚\n", + "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % 
\u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp50 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mentity_id \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 5000\u001b[0m โ”‚ \u001b[36m 2900\u001b[0m โ”‚ \u001b[36m 4\u001b[0m โ”‚ \u001b[36m 2500\u001b[0m โ”‚ \u001b[36m 5100\u001b[0m โ”‚ \u001b[36m 7600\u001b[0m โ”‚ \u001b[36m 10000\u001b[0m โ”‚ \u001b[32m โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡ \u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mvalue \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[32m โ–‡ \u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ”‚ \u001b[3m datetime \u001b[0m โ”‚\n", + "โ”‚ 
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfirst \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mlast \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfrequency \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mtimestamp \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[31m 1965-01-04 07:50:00 \u001b[0m โ”‚ \u001b[31m 1969-12-31 10:15:00 \u001b[0m โ”‚ \u001b[38;5;141mNone \u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End 
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (3_103, 3)
entity_idtimestampvalue
i64datetime[ฮผs]i64
17271966-08-01 07:08:001
22691965-09-07 11:14:001
60501969-10-28 22:11:001
22301966-10-06 16:26:001
64491966-02-06 14:18:001
31651965-03-03 18:44:001
96021968-07-01 03:24:001
95691968-02-07 11:17:001
121965-04-17 07:17:001
92241967-12-01 09:23:001
26381966-12-11 07:07:001
34151968-06-20 21:58:001
96791967-12-11 09:25:001
33761969-03-23 02:20:001
83481966-02-10 10:22:001
33701969-07-01 12:05:001
45761968-02-04 19:26:001
36231968-02-13 14:43:001
55911967-03-13 02:57:001
95631969-03-07 08:20:001
8991967-07-30 06:03:001
79791967-02-13 12:39:001
84791965-09-11 10:29:001
16551966-05-21 13:06:001
" + ], + "text/plain": [ + "shape: (3_103, 3)\n", + "โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”\n", + "โ”‚ entity_id โ”† timestamp โ”† value โ”‚\n", + "โ”‚ --- โ”† --- โ”† --- โ”‚\n", + "โ”‚ i64 โ”† datetime[ฮผs] โ”† i64 โ”‚\n", + "โ•žโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•ก\n", + "โ”‚ 1727 โ”† 1966-08-01 07:08:00 โ”† 1 โ”‚\n", + "โ”‚ 2269 โ”† 1965-09-07 11:14:00 โ”† 1 โ”‚\n", + "โ”‚ 6050 โ”† 1969-10-28 22:11:00 โ”† 1 โ”‚\n", + "โ”‚ 2230 โ”† 1966-10-06 16:26:00 โ”† 1 โ”‚\n", + "โ”‚ 6449 โ”† 1966-02-06 14:18:00 โ”† 1 โ”‚\n", + "โ”‚ โ€ฆ โ”† โ€ฆ โ”† โ€ฆ โ”‚\n", + "โ”‚ 9563 โ”† 1969-03-07 08:20:00 โ”† 1 โ”‚\n", + "โ”‚ 899 โ”† 1967-07-30 06:03:00 โ”† 1 โ”‚\n", + "โ”‚ 7979 โ”† 1967-02-13 12:39:00 โ”† 1 โ”‚\n", + "โ”‚ 8479 โ”† 1965-09-11 10:29:00 โ”† 1 โ”‚\n", + "โ”‚ 1655 โ”† 1966-05-21 13:06:00 โ”† 1 โ”‚\n", + "โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from timeseriesflattener.testing.load_synth_data import load_synth_outcome\n", + "\n", + "df_synth_outcome = load_synth_outcome()\n", + "\n", + "skim(df_synth_outcome)\n", + "df_synth_outcome" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This dataframe should contain at most 1 row per ID, which is the first time they experience the outcome.\n", + "\n", + "We now have 4 dataframes loaded: df_prediction_times, df_synth_predictors, df_synth_sex and df_synth_outcome.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Specifying how to flatten the data\n", + "\n", + "We'll have to specify how to flatten predictors and outcomes. 
To do this, we use the feature specification objects as \"recipes\" for each column in our finished dataframe. Firstly, we'll specify the outcome specification.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Temporal outcome specification\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/term_a.png)\n", + "\n", + "The main decision to make for outcomes is the size of the **lookahead** window. It determines how far into the future from a given prediction time to look for outcome values.\n", + "A **prediction time** indicates at which point the model issues a prediction, and is used as a reference for the _lookahead_.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/term_b.png)\n", + "\n", + "We want labels for prediction times to be 0 if the outcome never occurs, or if the outcome happens outside the lookahead window. Labels should only be 1 if the outcome occurs inside the lookahead window. Let's specify this in code.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mRunning cells with '.venv' requires the ipykernel package.\n", + "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. 
\n", + "\u001b[1;31mCommand: '/Users/au554730/Desktop/Projects/timeseriesflattener/.venv/bin/python -m pip install ipykernel -U --force-reinstall'" + ] + } + ], + "source": [ + "import datetime as dt\n", + "\n", + "import pandas as pd\n", + "from timeseriesflattener import BooleanOutcomeSpec, TimestampValueFrame, ValueFrame\n", + "from timeseriesflattener.aggregators import MaxAggregator\n", + "\n", + "test_df = pd.DataFrame({\"entity_id\": [0], \"timestamp\": [pd.Timestamp(\"2020-01-01\")]})\n", + "\n", + "outcome_spec = BooleanOutcomeSpec(\n", + " init_frame=TimestampValueFrame(\n", + " entity_id_col_name=\"entity_id\", init_df=test_df, value_timestamp_col_name=\"timestamp\"\n", + " ),\n", + " lookahead_distances=[dt.timedelta(days=365)],\n", + " aggregators=[MaxAggregator()],\n", + " output_name=\"outcome\",\n", + " column_prefix=\"outc\",\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since our outcome is binary, we want each prediction time to be labeled with 0 for the outcome if none is present within lookahead days. To do this, we use the fallback argument, which specifies the default value to use if none are found in `values_df` within `lookahead`. For the BooleanOutcomeSpec, this is hardcoded to 0.\n", + "\n", + "Your use case determines how you want to handle multiple outcome values within lookahead days. In this case, we decide that any prediction time with at least one outcome (a timestamp in the loaded outcome data with a corresponding value of 1) within the specified lookahead days is \"positive\". I.e., if there is both a 0 and a 1 within lookahead days, the prediction time should be labeled with a 1. We set `aggregators = [MaxAggregator()]` to accomplish this.\n", + "\n", + "Here, we specify that we want to look 365 days forward from the prediction time to search for outcomes. 
If we wanted to require a certain period of time from the prediction time before we look for outcome values, we can specify `lookahead` as an interval of (min_days, max_days) as a tuple instead.\n", + "\n", + "Lastly, we specify a name of the outcome which'll be used when generating its column.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Temporal predictor specification\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specifying a predictor is almost entirely identical to specifying an outcome. The only exception is that it looks a given number of days into the past from each prediction time instead of ahead.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mRunning cells with '.venv' requires the ipykernel package.\n", + "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. 
\n", + "\u001b[1;31mCommand: '/Users/au554730/Desktop/Projects/timeseriesflattener/.venv/bin/python -m pip install ipykernel -U --force-reinstall'" + ] + } + ], + "source": [ + "import numpy as np\n", + "from timeseriesflattener import PredictorSpec, StaticSpec\n", + "from timeseriesflattener.aggregators import MeanAggregator\n", + "\n", + "temporal_predictor_spec = PredictorSpec(\n", + " value_frame=ValueFrame(\n", + " entity_id_col_name=\"entity_id\",\n", + " init_df=df_synth_predictors.rename({\"value\": \"value_1\"}),\n", + " value_timestamp_col_name=\"timestamp\",\n", + " ),\n", + " aggregators=[MeanAggregator()],\n", + " column_prefix=\"pred\",\n", + " fallback=np.nan,\n", + " lookbehind_distances=[dt.timedelta(days=730)],\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](img/term_c.png)\n", + "\n", + "Values within the _lookbehind_ window are aggregated using `aggregators`, for example the mean as shown in this example, or max/min etc.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that we rename the value column to value_1. The value column's name determines the name of the output column after aggregation. To avoid multiple output columns with the same name, all input value columns must have unique names.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Temporal predictors can also be specified to look for values within a certain time range from the prediction time, similar to outcome specifications. 
For instance, you might want to create multiple predictors, where one looks for values within (0, 30) days, and another within (31, 182) days.\n", + "\n", + "This can easily be specified by passing a tuple of (min, max) `dt.timedelta` values in the `lookbehind_distances` list.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mRunning cells with '.venv' requires the ipykernel package.\n", + "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n", + "\u001b[1;31mCommand: '/Users/au554730/Desktop/Projects/timeseriesflattener/.venv/bin/python -m pip install ipykernel -U --force-reinstall'" + ] + } + ], + "source": [ + "temporal_interval_predictor_spec = PredictorSpec(\n", + " value_frame=ValueFrame(\n", + " entity_id_col_name=\"entity_id\",\n", + " init_df=df_synth_predictors.rename({\"value\": \"value_2\"}),\n", + " value_timestamp_col_name=\"timestamp\",\n", + " ),\n", + " aggregators=[MeanAggregator()],\n", + " column_prefix=\"pred\",\n", + " fallback=np.nan,\n", + " lookbehind_distances=[(dt.timedelta(days=10), dt.timedelta(days=365))],\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Static predictor specification\n", + "\n", + "Static features should be specified using `StaticSpec` as they are handled slightly differently. As in the previous specifications, we provide a `values_df` containing the values and we set the feature name. However, now we also add a prefix. 
By default, `PredictorSpec` prefixes columns with โ€œpredโ€ and `OutcomeSpec` prefixes columns with โ€œoutcโ€ to make filtering easy.\n", + "As `StaticSpec` can be used for both generating predictors and outcomes, we manually set the prefix to be โ€œpredโ€, as sex is used as predictor in this case.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (9_999, 2)
entity_idfemale
i64i64
00
11
21
31
40
50
60
70
81
90
100
110
99881
99890
99901
99910
99920
99930
99941
99950
99960
99971
99981
99990
" + ], + "text/plain": [ + "shape: (9_999, 2)\n", + "โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”\n", + "โ”‚ entity_id โ”† female โ”‚\n", + "โ”‚ --- โ”† --- โ”‚\n", + "โ”‚ i64 โ”† i64 โ”‚\n", + "โ•žโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•ก\n", + "โ”‚ 0 โ”† 0 โ”‚\n", + "โ”‚ 1 โ”† 1 โ”‚\n", + "โ”‚ 2 โ”† 1 โ”‚\n", + "โ”‚ 3 โ”† 1 โ”‚\n", + "โ”‚ 4 โ”† 0 โ”‚\n", + "โ”‚ โ€ฆ โ”† โ€ฆ โ”‚\n", + "โ”‚ 9995 โ”† 0 โ”‚\n", + "โ”‚ 9996 โ”† 0 โ”‚\n", + "โ”‚ 9997 โ”† 1 โ”‚\n", + "โ”‚ 9998 โ”† 1 โ”‚\n", + "โ”‚ 9999 โ”† 0 โ”‚\n", + "โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from timeseriesflattener import StaticFrame\n", + "\n", + "sex_predictor_spec = StaticSpec(\n", + " value_frame=StaticFrame(init_df=df_synth_sex), column_prefix=\"pred\", fallback=np.nan\n", + ")\n", + "\n", + "df_synth_sex" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that we don't need to specify which columns to aggregate. Timeseriesflattener aggregates all columns that are not `entity_id_col_name` or `value_timestamp_col_name` and uses the name(s) of the column(s) for the output." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we're ready to flatten our dataset!\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Flattening\n", + "\n", + "Flattening is as easy as instantiating the `TimeseriesFlattener` class with the prediction times df along with dataset specific metadata and calling the `add_*` functions. 
`n_workers` can be set to parallelize operations across multiple cores.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from timeseriesflattener import Flattener, PredictionTimeFrame\n", + "\n", + "flattener = Flattener(\n", + " predictiontime_frame=PredictionTimeFrame(\n", + " init_df=df_prediction_times, entity_id_col_name=\"entity_id\", timestamp_col_name=\"timestamp\"\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
/Users/au484925/Desktop/Git/timeseriesflattener/.venv/lib/python3.9/site-packages/rich/live.py:231: UserWarning: \n",
+                            "install \"ipywidgets\" for Jupyter support\n",
+                            "  warnings.warn('install \"ipywidgets\" for Jupyter support')\n",
+                            "
\n" + ], + "text/plain": [ + "/Users/au484925/Desktop/Git/timeseriesflattener/.venv/lib/python3.9/site-packages/rich/live.py:231: UserWarning: \n", + "install \"ipywidgets\" for Jupyter support\n", + " warnings.warn('install \"ipywidgets\" for Jupyter support')\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Processing spec: female\n",
+                            "
\n" + ], + "text/plain": [ + "Processing spec: female\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Processing spec: value_1\n",
+                            "
\n" + ], + "text/plain": [ + "Processing spec: value_1\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Processing spec: value_2\n",
+                            "
\n" + ], + "text/plain": [ + "Processing spec: value_2\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Processing spec: value\n",
+                            "
\n" + ], + "text/plain": [ + "Processing spec: value\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+                        ],
+                        "text/plain": []
+                    },
+                    "metadata": {},
+                    "output_type": "display_data"
+                },
+                {
+                    "data": {
+                        "text/html": [
+                            "
\n",
+                            "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/au484925/Desktop/Git/timeseriesflattener/.venv/lib/python3.9/site-packages/numpy/lib/histograms.py:885: RuntimeWarning: invalid value encountered in divide\n", + " return n/db/n.sum(), bin_edges\n" + ] + }, + { + "data": { + "text/html": [ + "
โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n",
+                            "โ”‚          Data Summary                Data Types                                                                 โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“                                                          โ”‚\n",
+                            "โ”‚ โ”ƒ dataframe         โ”ƒ Values โ”ƒ โ”ƒ Column Type โ”ƒ Count โ”ƒ                                                          โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ                                                          โ”‚\n",
+                            "โ”‚ โ”‚ Number of rows    โ”‚ 10000  โ”‚ โ”‚ int64       โ”‚ 3     โ”‚                                                          โ”‚\n",
+                            "โ”‚ โ”‚ Number of columns โ”‚ 7      โ”‚ โ”‚ float64     โ”‚ 2     โ”‚                                                          โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ datetime64  โ”‚ 1     โ”‚                                                          โ”‚\n",
+                            "โ”‚                                โ”‚ string      โ”‚ 1     โ”‚                                                          โ”‚\n",
+                            "โ”‚                                โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜                                                          โ”‚\n",
+                            "โ”‚                                                     number                                                      โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
+                            "โ”‚ โ”ƒ column_name             โ”ƒ NA     โ”ƒ NA %  โ”ƒ mean    โ”ƒ sd    โ”ƒ p0    โ”ƒ p25   โ”ƒ p50   โ”ƒ p75  โ”ƒ p100  โ”ƒ hist   โ”ƒ  โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
+                            "โ”‚ โ”‚ entity_id               โ”‚      0 โ”‚     0 โ”‚    5000 โ”‚  2900 โ”‚     0 โ”‚  2500 โ”‚  4900 โ”‚ 7400 โ”‚ 10000 โ”‚ โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡ โ”‚  โ”‚\n",
+                            "โ”‚ โ”‚ pred_female_fallback_na โ”‚      0 โ”‚     0 โ”‚    0.49 โ”‚   0.5 โ”‚     0 โ”‚     0 โ”‚     0 โ”‚    1 โ”‚     1 โ”‚ โ–‡    โ–‡ โ”‚  โ”‚\n",
+                            "โ”‚ โ”‚ n                       โ”‚        โ”‚       โ”‚         โ”‚       โ”‚       โ”‚       โ”‚       โ”‚      โ”‚       โ”‚        โ”‚  โ”‚\n",
+                            "โ”‚ โ”‚ pred_value_1_within_0_t โ”‚      0 โ”‚     0 โ”‚       5 โ”‚  0.97 โ”‚  0.66 โ”‚   4.3 โ”‚     5 โ”‚  5.6 โ”‚     9 โ”‚  โ–โ–‡โ–‡โ–‚  โ”‚  โ”‚\n",
+                            "โ”‚ โ”‚ o_730_days_mean_fallbac โ”‚        โ”‚       โ”‚         โ”‚       โ”‚       โ”‚       โ”‚       โ”‚      โ”‚       โ”‚        โ”‚  โ”‚\n",
+                            "โ”‚ โ”‚ k_nan                   โ”‚        โ”‚       โ”‚         โ”‚       โ”‚       โ”‚       โ”‚       โ”‚      โ”‚       โ”‚        โ”‚  โ”‚\n",
+                            "โ”‚ โ”‚ pred_value_2_within_10_ โ”‚  10000 โ”‚   100 โ”‚     nan โ”‚   nan โ”‚   nan โ”‚   nan โ”‚   nan โ”‚  nan โ”‚   nan โ”‚        โ”‚  โ”‚\n",
+                            "โ”‚ โ”‚ to_365_days_mean_fallba โ”‚        โ”‚       โ”‚         โ”‚       โ”‚       โ”‚       โ”‚       โ”‚      โ”‚       โ”‚        โ”‚  โ”‚\n",
+                            "โ”‚ โ”‚ ck_nan                  โ”‚        โ”‚       โ”‚         โ”‚       โ”‚       โ”‚       โ”‚       โ”‚      โ”‚       โ”‚        โ”‚  โ”‚\n",
+                            "โ”‚ โ”‚ outc_value_within_0_to_ โ”‚      0 โ”‚     0 โ”‚  0.0001 โ”‚  0.01 โ”‚     0 โ”‚     0 โ”‚     0 โ”‚    0 โ”‚     1 โ”‚   โ–‡    โ”‚  โ”‚\n",
+                            "โ”‚ โ”‚ 365_days_max_fallback_0 โ”‚        โ”‚       โ”‚         โ”‚       โ”‚       โ”‚       โ”‚       โ”‚      โ”‚       โ”‚        โ”‚  โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
+                            "โ”‚                                                    datetime                                                     โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
+                            "โ”‚ โ”ƒ column_name      โ”ƒ NA   โ”ƒ NA %    โ”ƒ first                      โ”ƒ last                       โ”ƒ frequency    โ”ƒ  โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
+                            "โ”‚ โ”‚ timestamp        โ”‚    0 โ”‚       0 โ”‚    1965-01-02 09:35:00     โ”‚    1969-12-31 21:42:00     โ”‚ None         โ”‚  โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
+                            "โ”‚                                                     string                                                      โ”‚\n",
+                            "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
+                            "โ”‚ โ”ƒ column_name                   โ”ƒ NA     โ”ƒ NA %       โ”ƒ words per row               โ”ƒ total words            โ”ƒ  โ”‚\n",
+                            "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
+                            "โ”‚ โ”‚ pred_time_uuid                โ”‚      0 โ”‚          0 โ”‚                           2 โ”‚                  20000 โ”‚  โ”‚\n",
+                            "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
+                            "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n",
+                            "
\n" + ], + "text/plain": [ + "โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n", + "โ”‚ \u001b[3m Data Summary \u001b[0m \u001b[3m Data Types \u001b[0m โ”‚\n", + "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mdataframe \u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mValues\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mColumn Type\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mCount\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ Number of rows โ”‚ 10000 โ”‚ โ”‚ int64 โ”‚ 3 โ”‚ โ”‚\n", + "โ”‚ โ”‚ Number of columns โ”‚ 7 โ”‚ โ”‚ float64 โ”‚ 2 โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ datetime64 โ”‚ 1 โ”‚ โ”‚\n", + "โ”‚ โ”‚ string โ”‚ 1 โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ”‚ \u001b[3m number \u001b[0m โ”‚\n", + "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA 
\u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp50 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mentity_id \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 5000\u001b[0m โ”‚ \u001b[36m 2900\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 2500\u001b[0m โ”‚ \u001b[36m 4900\u001b[0m โ”‚ \u001b[36m7400\u001b[0m โ”‚ \u001b[36m10000\u001b[0m โ”‚ \u001b[32mโ–‡โ–‡โ–‡โ–‡โ–‡โ–‡\u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mpred_female_fallback_na\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0.49\u001b[0m โ”‚ \u001b[36m 0.5\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[32mโ–‡ โ–‡\u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mn \u001b[0m โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mpred_value_1_within_0_t\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 5\u001b[0m โ”‚ \u001b[36m 0.97\u001b[0m โ”‚ \u001b[36m 0.66\u001b[0m โ”‚ \u001b[36m 4.3\u001b[0m โ”‚ \u001b[36m 5\u001b[0m โ”‚ 
\u001b[36m 5.6\u001b[0m โ”‚ \u001b[36m 9\u001b[0m โ”‚ \u001b[32m โ–โ–‡โ–‡โ–‚ \u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mo_730_days_mean_fallbac\u001b[0m โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mk_nan \u001b[0m โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mpred_value_2_within_10_\u001b[0m โ”‚ \u001b[36m 10000\u001b[0m โ”‚ \u001b[36m 100\u001b[0m โ”‚ \u001b[36m nan\u001b[0m โ”‚ \u001b[36m nan\u001b[0m โ”‚ \u001b[36m nan\u001b[0m โ”‚ \u001b[36m nan\u001b[0m โ”‚ \u001b[36m nan\u001b[0m โ”‚ \u001b[36m nan\u001b[0m โ”‚ \u001b[36m nan\u001b[0m โ”‚ \u001b[32m \u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mto_365_days_mean_fallba\u001b[0m โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mck_nan \u001b[0m โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141moutc_value_within_0_to_\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0.0001\u001b[0m โ”‚ \u001b[36m 0.01\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[32m โ–‡ \u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141m365_days_max_fallback_0\u001b[0m โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ”‚ \u001b[3m datetime \u001b[0m โ”‚\n", + "โ”‚ 
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfirst \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mlast \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfrequency \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mtimestamp \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[31m 1965-01-02 09:35:00 \u001b[0m โ”‚ \u001b[31m 1969-12-31 21:42:00 \u001b[0m โ”‚ \u001b[38;5;141mNone \u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ”‚ \u001b[3m string \u001b[0m โ”‚\n", + "โ”‚ 
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", + "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mwords per row \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mtotal words \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", + "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", + "โ”‚ โ”‚ \u001b[38;5;141mpred_time_uuid \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 2\u001b[0m โ”‚ \u001b[36m 20000\u001b[0m โ”‚ โ”‚\n", + "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", + "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + 
"text/plain": [ + "['entity_id',\n", + " 'timestamp',\n", + " 'pred_time_uuid',\n", + " 'pred_female_fallback_nan',\n", + " 'pred_value_1_within_0_to_730_days_mean_fallback_nan',\n", + " 'pred_value_2_within_10_to_365_days_mean_fallback_nan',\n", + " 'outc_value_within_0_to_365_days_max_fallback_0']" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = flattener.aggregate_timeseries(\n", + " specs=[\n", + " sex_predictor_spec,\n", + " temporal_predictor_spec,\n", + " temporal_interval_predictor_spec,\n", + " outcome_spec,\n", + " ]\n", + ").df\n", + "\n", + "skim(df)\n", + "\n", + "list(df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10_000, 7)
entity_idtimestamppred_time_uuidpred_female_fallback_nanpredXpredX_30_to_90outc_Y
i64datetime[ฮผs]stri64f64f64i32
99031968-05-09 21:24:00"9903-1968-05-0โ€ฆ03.197213NaN0
74651966-05-24 01:23:00"7465-1966-05-2โ€ฆ14.243969NaN0
64471967-09-25 18:08:00"6447-1967-09-2โ€ฆ15.260492NaN0
21211966-05-05 20:52:00"2121-1966-05-0โ€ฆ04.798062NaN0
49271968-06-30 12:13:00"4927-1968-06-3โ€ฆ04.040067NaN0
54751967-01-09 03:09:00"5475-1967-01-0โ€ฆ05.953548NaN0
31571969-10-07 05:01:00"3157-1969-10-0โ€ฆ15.068696NaN0
97931968-12-15 12:59:00"9793-1968-12-1โ€ฆ06.93591NaN0
59621965-11-08 17:03:00"5962-1965-11-0โ€ฆ04.112929NaN0
97681967-07-04 23:09:00"9768-1967-07-0โ€ฆ15.053019NaN0
98611969-01-22 17:34:00"9861-1969-01-2โ€ฆ03.828896NaN0
6571969-04-14 15:47:00"657-1969-04-14โ€ฆ05.124361NaN0
65421967-04-15 14:37:00"6542-1967-04-1โ€ฆ16.099213NaN0
56071966-02-14 02:15:00"5607-1966-02-1โ€ฆ04.235413NaN0
42281967-02-26 05:45:00"4228-1967-02-2โ€ฆ05.181058NaN0
83371965-07-14 08:14:00"8337-1965-07-1โ€ฆ05.184373NaN0
97451969-02-04 01:18:00"9745-1969-02-0โ€ฆ16.005186NaN0
72221966-06-07 16:10:00"7222-1966-06-0โ€ฆ04.666051NaN0
33851967-07-17 19:18:00"3385-1967-07-1โ€ฆ16.98234NaN0
71591966-12-12 16:32:00"7159-1966-12-1โ€ฆ05.022668NaN0
1471965-03-12 05:32:00"147-1965-03-12โ€ฆ15.927239NaN0
14211968-04-15 15:53:00"1421-1968-04-1โ€ฆ06.061767NaN0
33531966-01-15 10:04:00"3353-1966-01-1โ€ฆ06.108344NaN0
19401968-05-17 10:49:00"1940-1968-05-1โ€ฆ04.909043NaN0
" + ], + "text/plain": [ + "shape: (10_000, 7)\n", + "โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”\n", + "โ”‚ entity_id โ”† timestamp โ”† pred_time_uuid โ”† pred_female_fa โ”† predX โ”† predX_30_to_9 โ”† outc_Y โ”‚\n", + "โ”‚ --- โ”† --- โ”† --- โ”† llback_nan โ”† --- โ”† 0 โ”† --- โ”‚\n", + "โ”‚ i64 โ”† datetime[ฮผs] โ”† str โ”† --- โ”† f64 โ”† --- โ”† i32 โ”‚\n", + "โ”‚ โ”† โ”† โ”† i64 โ”† โ”† f64 โ”† โ”‚\n", + "โ•žโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•ก\n", + "โ”‚ 9903 โ”† 1968-05-09 โ”† 9903-1968-05-0 โ”† 0 โ”† 3.197213 โ”† NaN โ”† 0 โ”‚\n", + "โ”‚ โ”† 21:24:00 โ”† 9 21:24:00.000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ 7465 โ”† 1966-05-24 โ”† 7465-1966-05-2 โ”† 1 โ”† 4.243969 โ”† NaN โ”† 0 โ”‚\n", + "โ”‚ โ”† 01:23:00 โ”† 4 01:23:00.000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ 6447 โ”† 1967-09-25 โ”† 6447-1967-09-2 โ”† 1 โ”† 5.260492 โ”† NaN โ”† 0 โ”‚\n", + "โ”‚ โ”† 18:08:00 โ”† 5 18:08:00.000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ 2121 โ”† 1966-05-05 โ”† 2121-1966-05-0 โ”† 0 โ”† 4.798062 โ”† NaN โ”† 0 โ”‚\n", + "โ”‚ โ”† 20:52:00 โ”† 5 20:52:00.000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ 4927 โ”† 1968-06-30 โ”† 4927-1968-06-3 โ”† 0 โ”† 4.040067 โ”† NaN โ”† 0 โ”‚\n", + "โ”‚ โ”† 12:13:00 โ”† 0 12:13:00.000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ โ€ฆ โ”† โ€ฆ โ”† โ€ฆ โ”† โ€ฆ โ”† โ€ฆ โ”† โ€ฆ โ”† 
โ€ฆ โ”‚\n", + "โ”‚ 7159 โ”† 1966-12-12 โ”† 7159-1966-12-1 โ”† 0 โ”† 5.022668 โ”† NaN โ”† 0 โ”‚\n", + "โ”‚ โ”† 16:32:00 โ”† 2 16:32:00.000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ 147 โ”† 1965-03-12 โ”† 147-1965-03-12 โ”† 1 โ”† 5.927239 โ”† NaN โ”† 0 โ”‚\n", + "โ”‚ โ”† 05:32:00 โ”† 05:32:00.00000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ โ”† โ”† 0 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ 1421 โ”† 1968-04-15 โ”† 1421-1968-04-1 โ”† 0 โ”† 6.061767 โ”† NaN โ”† 0 โ”‚\n", + "โ”‚ โ”† 15:53:00 โ”† 5 15:53:00.000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ 3353 โ”† 1966-01-15 โ”† 3353-1966-01-1 โ”† 0 โ”† 6.108344 โ”† NaN โ”† 0 โ”‚\n", + "โ”‚ โ”† 10:04:00 โ”† 5 10:04:00.000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ 1940 โ”† 1968-05-17 โ”† 1940-1968-05-1 โ”† 0 โ”† 4.909043 โ”† NaN โ”† 0 โ”‚\n", + "โ”‚ โ”† 10:49:00 โ”† 7 10:49:00.000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", + "โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# For displayability, shorten col names\n", + "shortened_pred = \"predX\"\n", + "shortened_predinterval = \"predX_30_to_90\"\n", + "shortened_outcome = \"outc_Y\"\n", + "\n", + "display_df = df.rename(\n", + " {\n", + " \"pred_value_1_within_0_to_730_days_mean_fallback_nan\": shortened_pred,\n", + " \"pred_value_2_within_10_to_365_days_mean_fallback_nan\": shortened_predinterval,\n", + " \"outc_outcome_within_0_to_365_days_max_fallback_0\": shortened_outcome,\n", + " }\n", + ")\n", + "display_df" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And there 
we go! A dataframe ready for classification, containing:\n", + "\n", + "1. The citizen IDs\n", + "2. Timestamps for each prediction time\n", + "3. A unique identifier for each prediction-time\n", + "4. Our predictor columns, prefixed with `pred_` and\n", + "5. Our outcome columns, prefixed with `outc_`\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "d2b49c0af2d95979144de75823f7cfbb268839811992fdd0cb17fc1bb54ce815" + } + } }, - { - "data": { - "text/html": [ - "
\n", - "shape: (10_000, 2)
entity_idtimestamp
i64datetime[ฮผs]
01969-01-11 09:55:00
11965-03-15 07:16:00
21969-09-13 23:18:00
31968-02-04 16:16:00
41965-01-28 12:33:00
51967-10-09 06:22:00
71969-11-17 02:50:00
81965-12-11 02:17:00
91965-08-21 22:00:00
101965-12-26 16:45:00
111966-07-09 13:54:00
121968-01-26 22:35:00
99901965-11-16 21:02:00
99901967-06-09 02:48:00
99911966-07-05 03:18:00
99941965-04-20 06:19:00
99941967-05-07 04:44:00
99951968-08-23 11:30:00
99951965-11-24 05:42:00
99961965-01-30 17:19:00
99961965-07-18 17:12:00
99971967-06-08 07:52:00
99991965-07-19 14:59:00
99991968-02-07 22:24:00
" - ], - "text/plain": [ - "shape: (10_000, 2)\n", - "โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”\n", - "โ”‚ entity_id โ”† timestamp โ”‚\n", - "โ”‚ --- โ”† --- โ”‚\n", - "โ”‚ i64 โ”† datetime[ฮผs] โ”‚\n", - "โ•žโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ก\n", - "โ”‚ 0 โ”† 1969-01-11 09:55:00 โ”‚\n", - "โ”‚ 1 โ”† 1965-03-15 07:16:00 โ”‚\n", - "โ”‚ 2 โ”† 1969-09-13 23:18:00 โ”‚\n", - "โ”‚ 3 โ”† 1968-02-04 16:16:00 โ”‚\n", - "โ”‚ 4 โ”† 1965-01-28 12:33:00 โ”‚\n", - "โ”‚ โ€ฆ โ”† โ€ฆ โ”‚\n", - "โ”‚ 9996 โ”† 1965-01-30 17:19:00 โ”‚\n", - "โ”‚ 9996 โ”† 1965-07-18 17:12:00 โ”‚\n", - "โ”‚ 9997 โ”† 1967-06-08 07:52:00 โ”‚\n", - "โ”‚ 9999 โ”† 1965-07-19 14:59:00 โ”‚\n", - "โ”‚ 9999 โ”† 1968-02-07 22:24:00 โ”‚\n", - "โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from __future__ import annotations\n", - "\n", - "from skimpy import skim\n", - "from timeseriesflattener.testing.load_synth_data import load_synth_prediction_times\n", - "\n", - "df_prediction_times = load_synth_prediction_times()\n", - "\n", - "skim(df_prediction_times)\n", - "df_prediction_times.sort([\"entity_id\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, \"entity_id\" represents a patient ID and โ€œtimestampโ€ refers to the time when we want to issue a prediction. Note that each ID can have multiple prediction times.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Loading a temporal predictor\n", - "\n", - "Then, we'll load the values for our temporal predictor. 
Temporal predictors are predictors that can have a different value at different timepoints.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n",
-       "โ”‚          Data Summary                Data Types                                                                 โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“                                                          โ”‚\n",
-       "โ”‚ โ”ƒ dataframe         โ”ƒ Values โ”ƒ โ”ƒ Column Type โ”ƒ Count โ”ƒ                                                          โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ                                                          โ”‚\n",
-       "โ”‚ โ”‚ Number of rows    โ”‚ 100000 โ”‚ โ”‚ int64       โ”‚ 1     โ”‚                                                          โ”‚\n",
-       "โ”‚ โ”‚ Number of columns โ”‚ 3      โ”‚ โ”‚ datetime64  โ”‚ 1     โ”‚                                                          โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ float64     โ”‚ 1     โ”‚                                                          โ”‚\n",
-       "โ”‚                                โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜                                                          โ”‚\n",
-       "โ”‚                                                     number                                                      โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
-       "โ”‚ โ”ƒ column_name     โ”ƒ NA   โ”ƒ NA %   โ”ƒ mean   โ”ƒ sd     โ”ƒ p0        โ”ƒ p25    โ”ƒ p50    โ”ƒ p75   โ”ƒ p100   โ”ƒ hist    โ”ƒ  โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
-       "โ”‚ โ”‚ entity_id       โ”‚    0 โ”‚      0 โ”‚   5000 โ”‚   2900 โ”‚         0 โ”‚   2500 โ”‚   5000 โ”‚  7500 โ”‚  10000 โ”‚ โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡  โ”‚  โ”‚\n",
-       "โ”‚ โ”‚ value           โ”‚    0 โ”‚      0 โ”‚      5 โ”‚    2.9 โ”‚   0.00015 โ”‚    2.5 โ”‚      5 โ”‚   7.5 โ”‚     10 โ”‚ โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡  โ”‚  โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
-       "โ”‚                                                    datetime                                                     โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
-       "โ”‚ โ”ƒ column_name      โ”ƒ NA   โ”ƒ NA %    โ”ƒ first                      โ”ƒ last                       โ”ƒ frequency    โ”ƒ  โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
-       "โ”‚ โ”‚ timestamp        โ”‚    0 โ”‚       0 โ”‚    1965-01-02 00:01:00     โ”‚    1969-12-31 23:37:00     โ”‚ None         โ”‚  โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
-       "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n",
-       "
\n" - ], - "text/plain": [ - "โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n", - "โ”‚ \u001b[3m Data Summary \u001b[0m \u001b[3m Data Types \u001b[0m โ”‚\n", - "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mdataframe \u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mValues\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mColumn Type\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mCount\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ Number of rows โ”‚ 100000 โ”‚ โ”‚ int64 โ”‚ 1 โ”‚ โ”‚\n", - "โ”‚ โ”‚ Number of columns โ”‚ 3 โ”‚ โ”‚ datetime64 โ”‚ 1 โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ float64 โ”‚ 1 โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ”‚ \u001b[3m number \u001b[0m โ”‚\n", - "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m 
\u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp50 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mentity_id \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 5000\u001b[0m โ”‚ \u001b[36m 2900\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 2500\u001b[0m โ”‚ \u001b[36m 5000\u001b[0m โ”‚ \u001b[36m 7500\u001b[0m โ”‚ \u001b[36m 10000\u001b[0m โ”‚ \u001b[32mโ–‡โ–‡โ–‡โ–‡โ–‡โ–‡ \u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mvalue \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 5\u001b[0m โ”‚ \u001b[36m 2.9\u001b[0m โ”‚ \u001b[36m 0.00015\u001b[0m โ”‚ \u001b[36m 2.5\u001b[0m โ”‚ \u001b[36m 5\u001b[0m โ”‚ \u001b[36m 7.5\u001b[0m โ”‚ \u001b[36m 10\u001b[0m โ”‚ \u001b[32mโ–‡โ–‡โ–‡โ–‡โ–‡โ–‡ \u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ”‚ \u001b[3m 
datetime \u001b[0m โ”‚\n", - "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfirst \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mlast \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfrequency \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mtimestamp \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[31m 1965-01-02 00:01:00 \u001b[0m โ”‚ \u001b[31m 1969-12-31 23:37:00 \u001b[0m โ”‚ \u001b[38;5;141mNone \u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End 
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "shape: (100_000, 3)
entity_idtimestampvalue
i64datetime[ฮผs]f64
01967-06-12 14:06:000.174793
01968-04-15 01:45:003.072293
01968-12-09 05:42:001.315754
01969-06-20 18:07:002.812481
01967-11-26 01:59:002.981185
01968-09-07 01:45:000.173205
01969-02-21 03:29:009.943505
01967-11-26 13:45:005.470792
01967-05-12 12:44:000.970382
01965-05-03 05:23:006.630007
01969-10-19 10:29:000.799246
01969-11-25 18:39:002.868688
99991967-12-11 21:34:005.379503
99991968-05-26 12:28:009.645746
99991966-04-23 02:37:008.924175
99991968-10-24 18:47:002.34615
99991967-11-12 07:33:004.414524
99991966-07-15 20:55:000.684468
99991968-01-08 02:41:003.250538
99991968-08-19 10:15:000.671907
99991966-01-03 22:34:004.158796
99991966-06-27 10:55:004.414455
99991968-04-02 12:58:001.552491
99991969-06-24 07:19:004.501553
" - ], - "text/plain": [ - "shape: (100_000, 3)\n", - "โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”\n", - "โ”‚ entity_id โ”† timestamp โ”† value โ”‚\n", - "โ”‚ --- โ”† --- โ”† --- โ”‚\n", - "โ”‚ i64 โ”† datetime[ฮผs] โ”† f64 โ”‚\n", - "โ•žโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ก\n", - "โ”‚ 0 โ”† 1967-06-12 14:06:00 โ”† 0.174793 โ”‚\n", - "โ”‚ 0 โ”† 1968-04-15 01:45:00 โ”† 3.072293 โ”‚\n", - "โ”‚ 0 โ”† 1968-12-09 05:42:00 โ”† 1.315754 โ”‚\n", - "โ”‚ 0 โ”† 1969-06-20 18:07:00 โ”† 2.812481 โ”‚\n", - "โ”‚ 0 โ”† 1967-11-26 01:59:00 โ”† 2.981185 โ”‚\n", - "โ”‚ โ€ฆ โ”† โ€ฆ โ”† โ€ฆ โ”‚\n", - "โ”‚ 9999 โ”† 1968-08-19 10:15:00 โ”† 0.671907 โ”‚\n", - "โ”‚ 9999 โ”† 1966-01-03 22:34:00 โ”† 4.158796 โ”‚\n", - "โ”‚ 9999 โ”† 1966-06-27 10:55:00 โ”† 4.414455 โ”‚\n", - "โ”‚ 9999 โ”† 1968-04-02 12:58:00 โ”† 1.552491 โ”‚\n", - "โ”‚ 9999 โ”† 1969-06-24 07:19:00 โ”† 4.501553 โ”‚\n", - "โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from timeseriesflattener.testing.load_synth_data import load_synth_predictor_float\n", - "\n", - "df_synth_predictors = load_synth_predictor_float()\n", - "\n", - "skim(df_synth_predictors)\n", - "df_synth_predictors.sort([\"entity_id\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once again, note that there can be multiple values for each ID.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Loading a static predictor\n", - "\n", - "Frequently, you'll have one or more static predictors describing each entity. 
In this case, an entity is a patient, and an example of a static outcome could be their sex. It doesn't change over time (it's static), but can be used as a predictor for each prediction time. Let's load it in!\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n",
-       "โ”‚          Data Summary                Data Types                                                                 โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“                                                          โ”‚\n",
-       "โ”‚ โ”ƒ dataframe         โ”ƒ Values โ”ƒ โ”ƒ Column Type โ”ƒ Count โ”ƒ                                                          โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ                                                          โ”‚\n",
-       "โ”‚ โ”‚ Number of rows    โ”‚ 9999   โ”‚ โ”‚ int64       โ”‚ 2     โ”‚                                                          โ”‚\n",
-       "โ”‚ โ”‚ Number of columns โ”‚ 2      โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜                                                          โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜                                                                                  โ”‚\n",
-       "โ”‚                                                     number                                                      โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
-       "โ”‚ โ”ƒ column_name      โ”ƒ NA   โ”ƒ NA %    โ”ƒ mean   โ”ƒ sd     โ”ƒ p0   โ”ƒ p25    โ”ƒ p50    โ”ƒ p75    โ”ƒ p100    โ”ƒ hist     โ”ƒ  โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
-       "โ”‚ โ”‚ entity_id        โ”‚    0 โ”‚       0 โ”‚   5000 โ”‚   2900 โ”‚    0 โ”‚   2500 โ”‚   5000 โ”‚   7500 โ”‚   10000 โ”‚  โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡  โ”‚  โ”‚\n",
-       "โ”‚ โ”‚ female           โ”‚    0 โ”‚       0 โ”‚    0.5 โ”‚    0.5 โ”‚    0 โ”‚      0 โ”‚      0 โ”‚      1 โ”‚       1 โ”‚  โ–‡    โ–‡  โ”‚  โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
-       "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n",
-       "
\n" - ], - "text/plain": [ - "โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n", - "โ”‚ \u001b[3m Data Summary \u001b[0m \u001b[3m Data Types \u001b[0m โ”‚\n", - "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mdataframe \u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mValues\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mColumn Type\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mCount\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ Number of rows โ”‚ 9999 โ”‚ โ”‚ int64 โ”‚ 2 โ”‚ โ”‚\n", - "โ”‚ โ”‚ Number of columns โ”‚ 2 โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ”‚ \u001b[3m number \u001b[0m โ”‚\n", - "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m 
\u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp50 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mentity_id \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 5000\u001b[0m โ”‚ \u001b[36m 2900\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 2500\u001b[0m โ”‚ \u001b[36m 5000\u001b[0m โ”‚ \u001b[36m 7500\u001b[0m โ”‚ \u001b[36m 10000\u001b[0m โ”‚ \u001b[32m โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡ \u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mfemale \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0.5\u001b[0m โ”‚ \u001b[36m 0.5\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[32m โ–‡ โ–‡ \u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - 
"โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "shape: (9_999, 2)
entity_idfemale
i64i64
00
11
21
31
40
50
60
70
81
90
100
110
99881
99890
99901
99910
99920
99930
99941
99950
99960
99971
99981
99990
" - ], - "text/plain": [ - "shape: (9_999, 2)\n", - "โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”\n", - "โ”‚ entity_id โ”† female โ”‚\n", - "โ”‚ --- โ”† --- โ”‚\n", - "โ”‚ i64 โ”† i64 โ”‚\n", - "โ•žโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•ก\n", - "โ”‚ 0 โ”† 0 โ”‚\n", - "โ”‚ 1 โ”† 1 โ”‚\n", - "โ”‚ 2 โ”† 1 โ”‚\n", - "โ”‚ 3 โ”† 1 โ”‚\n", - "โ”‚ 4 โ”† 0 โ”‚\n", - "โ”‚ โ€ฆ โ”† โ€ฆ โ”‚\n", - "โ”‚ 9995 โ”† 0 โ”‚\n", - "โ”‚ 9996 โ”† 0 โ”‚\n", - "โ”‚ 9997 โ”† 1 โ”‚\n", - "โ”‚ 9998 โ”† 1 โ”‚\n", - "โ”‚ 9999 โ”† 0 โ”‚\n", - "โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from timeseriesflattener.testing.load_synth_data import load_synth_sex\n", - "\n", - "df_synth_sex = load_synth_sex()\n", - "\n", - "skim(df_synth_sex)\n", - "df_synth_sex" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As the predictor is static, there should only be a single value for each ID in this dataframe.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Loading a temporal outcome\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And, lastly, our outcome values. We've chosen a binary outcome and only stored values for the timestamps that experience the outcome. From these, we can infer patients that do not experience the outcome, since they do not have a timestamp. We handle this by setting a fallback of 0 - more on that in the following section.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n",
-       "โ”‚          Data Summary                Data Types                                                                 โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“                                                          โ”‚\n",
-       "โ”‚ โ”ƒ dataframe         โ”ƒ Values โ”ƒ โ”ƒ Column Type โ”ƒ Count โ”ƒ                                                          โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ                                                          โ”‚\n",
-       "โ”‚ โ”‚ Number of rows    โ”‚ 3103   โ”‚ โ”‚ int64       โ”‚ 2     โ”‚                                                          โ”‚\n",
-       "โ”‚ โ”‚ Number of columns โ”‚ 3      โ”‚ โ”‚ datetime64  โ”‚ 1     โ”‚                                                          โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜                                                          โ”‚\n",
-       "โ”‚                                                     number                                                      โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
-       "โ”‚ โ”ƒ column_name      โ”ƒ NA   โ”ƒ NA %    โ”ƒ mean   โ”ƒ sd     โ”ƒ p0   โ”ƒ p25    โ”ƒ p50    โ”ƒ p75    โ”ƒ p100    โ”ƒ hist     โ”ƒ  โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
-       "โ”‚ โ”‚ entity_id        โ”‚    0 โ”‚       0 โ”‚   5000 โ”‚   2900 โ”‚    4 โ”‚   2500 โ”‚   5100 โ”‚   7600 โ”‚   10000 โ”‚  โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡  โ”‚  โ”‚\n",
-       "โ”‚ โ”‚ value            โ”‚    0 โ”‚       0 โ”‚      1 โ”‚      0 โ”‚    1 โ”‚      1 โ”‚      1 โ”‚      1 โ”‚       1 โ”‚      โ–‡   โ”‚  โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
-       "โ”‚                                                    datetime                                                     โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
-       "โ”‚ โ”ƒ column_name      โ”ƒ NA   โ”ƒ NA %    โ”ƒ first                      โ”ƒ last                       โ”ƒ frequency    โ”ƒ  โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
-       "โ”‚ โ”‚ timestamp        โ”‚    0 โ”‚       0 โ”‚    1965-01-04 07:50:00     โ”‚    1969-12-31 10:15:00     โ”‚ None         โ”‚  โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
-       "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n",
-       "
\n" - ], - "text/plain": [ - "โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n", - "โ”‚ \u001b[3m Data Summary \u001b[0m \u001b[3m Data Types \u001b[0m โ”‚\n", - "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mdataframe \u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mValues\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mColumn Type\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mCount\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ Number of rows โ”‚ 3103 โ”‚ โ”‚ int64 โ”‚ 2 โ”‚ โ”‚\n", - "โ”‚ โ”‚ Number of columns โ”‚ 3 โ”‚ โ”‚ datetime64 โ”‚ 1 โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ”‚ \u001b[3m number \u001b[0m โ”‚\n", - "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % 
\u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp50 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mentity_id \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 5000\u001b[0m โ”‚ \u001b[36m 2900\u001b[0m โ”‚ \u001b[36m 4\u001b[0m โ”‚ \u001b[36m 2500\u001b[0m โ”‚ \u001b[36m 5100\u001b[0m โ”‚ \u001b[36m 7600\u001b[0m โ”‚ \u001b[36m 10000\u001b[0m โ”‚ \u001b[32m โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡ \u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mvalue \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[32m โ–‡ \u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ”‚ \u001b[3m datetime \u001b[0m โ”‚\n", - "โ”‚ 
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfirst \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mlast \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfrequency \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mtimestamp \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[31m 1965-01-04 07:50:00 \u001b[0m โ”‚ \u001b[31m 1969-12-31 10:15:00 \u001b[0m โ”‚ \u001b[38;5;141mNone \u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End 
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "shape: (3_103, 3)
entity_idtimestampvalue
i64datetime[ฮผs]i64
17271966-08-01 07:08:001
22691965-09-07 11:14:001
60501969-10-28 22:11:001
22301966-10-06 16:26:001
64491966-02-06 14:18:001
31651965-03-03 18:44:001
96021968-07-01 03:24:001
95691968-02-07 11:17:001
121965-04-17 07:17:001
92241967-12-01 09:23:001
26381966-12-11 07:07:001
34151968-06-20 21:58:001
96791967-12-11 09:25:001
33761969-03-23 02:20:001
83481966-02-10 10:22:001
33701969-07-01 12:05:001
45761968-02-04 19:26:001
36231968-02-13 14:43:001
55911967-03-13 02:57:001
95631969-03-07 08:20:001
8991967-07-30 06:03:001
79791967-02-13 12:39:001
84791965-09-11 10:29:001
16551966-05-21 13:06:001
" - ], - "text/plain": [ - "shape: (3_103, 3)\n", - "โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”\n", - "โ”‚ entity_id โ”† timestamp โ”† value โ”‚\n", - "โ”‚ --- โ”† --- โ”† --- โ”‚\n", - "โ”‚ i64 โ”† datetime[ฮผs] โ”† i64 โ”‚\n", - "โ•žโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•ก\n", - "โ”‚ 1727 โ”† 1966-08-01 07:08:00 โ”† 1 โ”‚\n", - "โ”‚ 2269 โ”† 1965-09-07 11:14:00 โ”† 1 โ”‚\n", - "โ”‚ 6050 โ”† 1969-10-28 22:11:00 โ”† 1 โ”‚\n", - "โ”‚ 2230 โ”† 1966-10-06 16:26:00 โ”† 1 โ”‚\n", - "โ”‚ 6449 โ”† 1966-02-06 14:18:00 โ”† 1 โ”‚\n", - "โ”‚ โ€ฆ โ”† โ€ฆ โ”† โ€ฆ โ”‚\n", - "โ”‚ 9563 โ”† 1969-03-07 08:20:00 โ”† 1 โ”‚\n", - "โ”‚ 899 โ”† 1967-07-30 06:03:00 โ”† 1 โ”‚\n", - "โ”‚ 7979 โ”† 1967-02-13 12:39:00 โ”† 1 โ”‚\n", - "โ”‚ 8479 โ”† 1965-09-11 10:29:00 โ”† 1 โ”‚\n", - "โ”‚ 1655 โ”† 1966-05-21 13:06:00 โ”† 1 โ”‚\n", - "โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from timeseriesflattener.testing.load_synth_data import load_synth_outcome\n", - "\n", - "df_synth_outcome = load_synth_outcome()\n", - "\n", - "skim(df_synth_outcome)\n", - "df_synth_outcome" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This dataframe should contain at most 1 row per ID, which is the first time they experience the outcome.\n", - "\n", - "We now have 4 dataframes loaded: df_prediction_times, df_synth_predictors, df_synth_sex and df_synth_outcome.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Specifying how to flatten the data\n", - "\n", - "We'll have to specify how to flatten predictors and outcomes. 
To do this, we use the feature specification objects as \"recipes\" for each column in our finished dataframe. Firstly, we'll specify the outcome specification.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Temporal outcome specification\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![](img/term_a.png)\n", - "\n", - "The main decision to make for outcomes is the size of the **lookahead** window. It determines how far into the future from a given prediction time to look for outcome values.\n", - "A **prediction time** indicates at which point the model issues a prediction, and is used as a reference for the _lookahead_.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![](img/term_b.png)\n", - "\n", - "We want labels for prediction times to be 0 if the outcome never occurs, or if the outcome happens outside the lookahead window. Labels should only be 1 if the outcome occurs inside the lookahead window. Let's specify this in code.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mRunning cells with '.venv' requires the ipykernel package.\n", - "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. 
\n", - "\u001b[1;31mCommand: '/Users/au554730/Desktop/Projects/timeseriesflattener/.venv/bin/python -m pip install ipykernel -U --force-reinstall'" - ] - } - ], - "source": [ - "import datetime as dt\n", - "\n", - "import pandas as pd\n", - "from timeseriesflattener import BooleanOutcomeSpec, TimestampValueFrame, ValueFrame\n", - "from timeseriesflattener.aggregators import MaxAggregator\n", - "\n", - "test_df = pd.DataFrame({\"entity_id\": [0], \"timestamp\": [pd.Timestamp(\"2020-01-01\")]})\n", - "\n", - "outcome_spec = BooleanOutcomeSpec(\n", - " init_frame=TimestampValueFrame(\n", - " entity_id_col_name=\"entity_id\", init_df=test_df, value_timestamp_col_name=\"timestamp\"\n", - " ),\n", - " lookahead_distances=[dt.timedelta(days=365)],\n", - " aggregators=[MaxAggregator()],\n", - " output_name=\"outcome\",\n", - " column_prefix=\"outc\",\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since our outcome is binary, we want each prediction time to be labeled with 0 for the outcome if none is present within lookahead days. To do this, we use the fallback argument, which specifies the default value to use if none are found in `values_df` within `lookahead`. For the BooleanOutcomeSpec, this is hardcoded to 0.\n", - "\n", - "Your use case determines how you want to handle multiple outcome values within lookahead days. In this case, we decide that any prediction time with at least one outcome (a timestamp in the loaded outcome data with a corresponding value of 1) within the specified lookahead days is \"positive\". I.e., if there is both a 0 and a 1 within lookahead days, the prediction time should be labeled with a 1. We set `aggregators = [MaxAggregator()]` to accomplish this.\n", - "\n", - "Here, we specifiy that we want to look 365 days forward from the prediction time to search for outcomes. 
If we wanted to require a certain period of time from the prediction time before we look for outcome values, we can specify `lookahead` as an interval of (min_days, max_days) as a tuple instead.\n", - "\n", - "Lastly, we specify a name of the outcome which'll be used when generating its column.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Temporal predictor specification\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Specifying a predictor is almost entirely identical to specifying an outcome. The only exception is that it looks a given number of days into the past from each prediction time instead of ahead.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mRunning cells with '.venv' requires the ipykernel package.\n", - "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. 
\n", - "\u001b[1;31mCommand: '/Users/au554730/Desktop/Projects/timeseriesflattener/.venv/bin/python -m pip install ipykernel -U --force-reinstall'" - ] - } - ], - "source": [ - "import numpy as np\n", - "from timeseriesflattener import PredictorSpec, StaticSpec\n", - "from timeseriesflattener.aggregators import MeanAggregator\n", - "\n", - "temporal_predictor_spec = PredictorSpec(\n", - " value_frame=ValueFrame(\n", - " entity_id_col_name=\"entity_id\",\n", - " init_df=df_synth_predictors.rename({\"value\": \"value_1\"}),\n", - " value_timestamp_col_name=\"timestamp\",\n", - " ),\n", - " aggregators=[MeanAggregator()],\n", - " column_prefix=\"pred\",\n", - " fallback=np.nan,\n", - " lookbehind_distances=[dt.timedelta(days=730)],\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![](img/term_c.png)\n", - "\n", - "Values within the _lookbehind_ window are aggregated using `aggregators`, for example the mean as shown in this example, or max/min etc.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that we rename the value column to value_1. The value column's name determines the name of the output column after aggregation. To avoid multiple output columns with the same name, all input value columns must have unique names.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Temporal predictors can also be specified to look for values within a certain time range from the prediction time, similar to outcome specifications. 
For instance, you might want to create multiple predictors, where one looks for values within (0, 30) days, and another within (31, 182) days.\n", - "\n", - "This can easily be specified by passing a tuple[min_days, max_days] to the lookbehind_days parameter.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mRunning cells with '.venv' requires the ipykernel package.\n", - "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n", - "\u001b[1;31mCommand: '/Users/au554730/Desktop/Projects/timeseriesflattener/.venv/bin/python -m pip install ipykernel -U --force-reinstall'" - ] - } - ], - "source": [ - "temporal_interval_predictor_spec = PredictorSpec(\n", - " value_frame=ValueFrame(\n", - " entity_id_col_name=\"entity_id\",\n", - " init_df=df_synth_predictors.rename({\"value\": \"value_2\"}),\n", - " value_timestamp_col_name=\"timestamp\",\n", - " ),\n", - " aggregators=[MeanAggregator()],\n", - " column_prefix=\"pred\",\n", - " fallback=np.nan,\n", - " lookbehind_distances=[(dt.timedelta(days=10), dt.timedelta(days=365))],\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Static predictor specification\n", - "\n", - "Static features should be specified using `StaticSpec` as they are handled slightly differently. As in the previous specifications, we provide a `values_df` containing the values and we set the feature name. However, now we also add a prefix. 
By default, `PredictorSpec` prefixes columns with โ€œpredโ€ and `OutcomeSpec` prefixes columns with โ€œoutcโ€ to make filtering easy.\n", - "As `StaticSpec` can be used for both generating predictors and outcomes, we manually set the prefix to be โ€œpredโ€, as sex is used as predictor in this case.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (9_999, 2)
entity_idfemale
i64i64
00
11
21
31
40
50
60
70
81
90
100
110
99881
99890
99901
99910
99920
99930
99941
99950
99960
99971
99981
99990
" - ], - "text/plain": [ - "shape: (9_999, 2)\n", - "โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”\n", - "โ”‚ entity_id โ”† female โ”‚\n", - "โ”‚ --- โ”† --- โ”‚\n", - "โ”‚ i64 โ”† i64 โ”‚\n", - "โ•žโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•ก\n", - "โ”‚ 0 โ”† 0 โ”‚\n", - "โ”‚ 1 โ”† 1 โ”‚\n", - "โ”‚ 2 โ”† 1 โ”‚\n", - "โ”‚ 3 โ”† 1 โ”‚\n", - "โ”‚ 4 โ”† 0 โ”‚\n", - "โ”‚ โ€ฆ โ”† โ€ฆ โ”‚\n", - "โ”‚ 9995 โ”† 0 โ”‚\n", - "โ”‚ 9996 โ”† 0 โ”‚\n", - "โ”‚ 9997 โ”† 1 โ”‚\n", - "โ”‚ 9998 โ”† 1 โ”‚\n", - "โ”‚ 9999 โ”† 0 โ”‚\n", - "โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from timeseriesflattener import StaticFrame\n", - "\n", - "sex_predictor_spec = StaticSpec(\n", - " value_frame=StaticFrame(init_df=df_synth_sex), column_prefix=\"pred\", fallback=np.nan\n", - ")\n", - "\n", - "df_synth_sex" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that we don't need to specify which columns to aggregate. Timeseriesflattener aggregates all columns that are not `entity_id_col_name` or `value_timestamp_col_name` and uses the name(s) of the column(s) for the output." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we're ready to flatten our dataset!\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Flattening\n", - "\n", - "Flattening is as easy as instantiating the `TimeseriesFlattener` class with the prediction times df along with dataset specific metadata and calling the `add_*` functions. 
`n_workers` can be set to parallelize operations across multiple cores.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from timeseriesflattener import Flattener, PredictionTimeFrame\n", - "\n", - "flattener = Flattener(\n", - " predictiontime_frame=PredictionTimeFrame(\n", - " init_df=df_prediction_times, entity_id_col_name=\"entity_id\", timestamp_col_name=\"timestamp\"\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
/Users/au484925/Desktop/Git/timeseriesflattener/.venv/lib/python3.9/site-packages/rich/live.py:231: UserWarning: \n",
-       "install \"ipywidgets\" for Jupyter support\n",
-       "  warnings.warn('install \"ipywidgets\" for Jupyter support')\n",
-       "
\n" - ], - "text/plain": [ - "/Users/au484925/Desktop/Git/timeseriesflattener/.venv/lib/python3.9/site-packages/rich/live.py:231: UserWarning: \n", - "install \"ipywidgets\" for Jupyter support\n", - " warnings.warn('install \"ipywidgets\" for Jupyter support')\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Processing spec: female\n",
-       "
\n" - ], - "text/plain": [ - "Processing spec: female\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Processing spec: value_1\n",
-       "
\n" - ], - "text/plain": [ - "Processing spec: value_1\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Processing spec: value_2\n",
-       "
\n" - ], - "text/plain": [ - "Processing spec: value_2\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Processing spec: value\n",
-       "
\n" - ], - "text/plain": [ - "Processing spec: value\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/au484925/Desktop/Git/timeseriesflattener/.venv/lib/python3.9/site-packages/numpy/lib/histograms.py:885: RuntimeWarning: invalid value encountered in divide\n", - " return n/db/n.sum(), bin_edges\n" - ] - }, - { - "data": { - "text/html": [ - "
โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n",
-       "โ”‚          Data Summary                Data Types                                                                 โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“                                                          โ”‚\n",
-       "โ”‚ โ”ƒ dataframe         โ”ƒ Values โ”ƒ โ”ƒ Column Type โ”ƒ Count โ”ƒ                                                          โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ                                                          โ”‚\n",
-       "โ”‚ โ”‚ Number of rows    โ”‚ 10000  โ”‚ โ”‚ int64       โ”‚ 3     โ”‚                                                          โ”‚\n",
-       "โ”‚ โ”‚ Number of columns โ”‚ 7      โ”‚ โ”‚ float64     โ”‚ 2     โ”‚                                                          โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ datetime64  โ”‚ 1     โ”‚                                                          โ”‚\n",
-       "โ”‚                                โ”‚ string      โ”‚ 1     โ”‚                                                          โ”‚\n",
-       "โ”‚                                โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜                                                          โ”‚\n",
-       "โ”‚                                                     number                                                      โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
-       "โ”‚ โ”ƒ column_name             โ”ƒ NA     โ”ƒ NA %  โ”ƒ mean    โ”ƒ sd    โ”ƒ p0    โ”ƒ p25   โ”ƒ p50   โ”ƒ p75  โ”ƒ p100  โ”ƒ hist   โ”ƒ  โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
-       "โ”‚ โ”‚ entity_id               โ”‚      0 โ”‚     0 โ”‚    5000 โ”‚  2900 โ”‚     0 โ”‚  2500 โ”‚  4900 โ”‚ 7400 โ”‚ 10000 โ”‚ โ–‡โ–‡โ–‡โ–‡โ–‡โ–‡ โ”‚  โ”‚\n",
-       "โ”‚ โ”‚ pred_female_fallback_na โ”‚      0 โ”‚     0 โ”‚    0.49 โ”‚   0.5 โ”‚     0 โ”‚     0 โ”‚     0 โ”‚    1 โ”‚     1 โ”‚ โ–‡    โ–‡ โ”‚  โ”‚\n",
-       "โ”‚ โ”‚ n                       โ”‚        โ”‚       โ”‚         โ”‚       โ”‚       โ”‚       โ”‚       โ”‚      โ”‚       โ”‚        โ”‚  โ”‚\n",
-       "โ”‚ โ”‚ pred_value_1_within_0_t โ”‚      0 โ”‚     0 โ”‚       5 โ”‚  0.97 โ”‚  0.66 โ”‚   4.3 โ”‚     5 โ”‚  5.6 โ”‚     9 โ”‚  โ–โ–‡โ–‡โ–‚  โ”‚  โ”‚\n",
-       "โ”‚ โ”‚ o_730_days_mean_fallbac โ”‚        โ”‚       โ”‚         โ”‚       โ”‚       โ”‚       โ”‚       โ”‚      โ”‚       โ”‚        โ”‚  โ”‚\n",
-       "โ”‚ โ”‚ k_nan                   โ”‚        โ”‚       โ”‚         โ”‚       โ”‚       โ”‚       โ”‚       โ”‚      โ”‚       โ”‚        โ”‚  โ”‚\n",
-       "โ”‚ โ”‚ pred_value_2_within_10_ โ”‚  10000 โ”‚   100 โ”‚     nan โ”‚   nan โ”‚   nan โ”‚   nan โ”‚   nan โ”‚  nan โ”‚   nan โ”‚        โ”‚  โ”‚\n",
-       "โ”‚ โ”‚ to_365_days_mean_fallba โ”‚        โ”‚       โ”‚         โ”‚       โ”‚       โ”‚       โ”‚       โ”‚      โ”‚       โ”‚        โ”‚  โ”‚\n",
-       "โ”‚ โ”‚ ck_nan                  โ”‚        โ”‚       โ”‚         โ”‚       โ”‚       โ”‚       โ”‚       โ”‚      โ”‚       โ”‚        โ”‚  โ”‚\n",
-       "โ”‚ โ”‚ outc_value_within_0_to_ โ”‚      0 โ”‚     0 โ”‚  0.0001 โ”‚  0.01 โ”‚     0 โ”‚     0 โ”‚     0 โ”‚    0 โ”‚     1 โ”‚   โ–‡    โ”‚  โ”‚\n",
-       "โ”‚ โ”‚ 365_days_max_fallback_0 โ”‚        โ”‚       โ”‚         โ”‚       โ”‚       โ”‚       โ”‚       โ”‚      โ”‚       โ”‚        โ”‚  โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
-       "โ”‚                                                    datetime                                                     โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
-       "โ”‚ โ”ƒ column_name      โ”ƒ NA   โ”ƒ NA %    โ”ƒ first                      โ”ƒ last                       โ”ƒ frequency    โ”ƒ  โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
-       "โ”‚ โ”‚ timestamp        โ”‚    0 โ”‚       0 โ”‚    1965-01-02 09:35:00     โ”‚    1969-12-31 21:42:00     โ”‚ None         โ”‚  โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
-       "โ”‚                                                     string                                                      โ”‚\n",
-       "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“  โ”‚\n",
-       "โ”‚ โ”ƒ column_name                   โ”ƒ NA     โ”ƒ NA %       โ”ƒ words per row               โ”ƒ total words            โ”ƒ  โ”‚\n",
-       "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ  โ”‚\n",
-       "โ”‚ โ”‚ pred_time_uuid                โ”‚      0 โ”‚          0 โ”‚                           2 โ”‚                  20000 โ”‚  โ”‚\n",
-       "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜  โ”‚\n",
-       "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n",
-       "
\n" - ], - "text/plain": [ - "โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ skimpy summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ\n", - "โ”‚ \u001b[3m Data Summary \u001b[0m \u001b[3m Data Types \u001b[0m โ”‚\n", - "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mdataframe \u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mValues\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mColumn Type\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ\u001b[1;36m \u001b[0m\u001b[1;36mCount\u001b[0m\u001b[1;36m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ Number of rows โ”‚ 10000 โ”‚ โ”‚ int64 โ”‚ 3 โ”‚ โ”‚\n", - "โ”‚ โ”‚ Number of columns โ”‚ 7 โ”‚ โ”‚ float64 โ”‚ 2 โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ datetime64 โ”‚ 1 โ”‚ โ”‚\n", - "โ”‚ โ”‚ string โ”‚ 1 โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ”‚ \u001b[3m number \u001b[0m โ”‚\n", - "โ”‚ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA 
\u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp50 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mentity_id \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 5000\u001b[0m โ”‚ \u001b[36m 2900\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 2500\u001b[0m โ”‚ \u001b[36m 4900\u001b[0m โ”‚ \u001b[36m7400\u001b[0m โ”‚ \u001b[36m10000\u001b[0m โ”‚ \u001b[32mโ–‡โ–‡โ–‡โ–‡โ–‡โ–‡\u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mpred_female_fallback_na\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0.49\u001b[0m โ”‚ \u001b[36m 0.5\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[32mโ–‡ โ–‡\u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mn \u001b[0m โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mpred_value_1_within_0_t\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 5\u001b[0m โ”‚ \u001b[36m 0.97\u001b[0m โ”‚ \u001b[36m 0.66\u001b[0m โ”‚ \u001b[36m 4.3\u001b[0m โ”‚ \u001b[36m 5\u001b[0m โ”‚ 
\u001b[36m 5.6\u001b[0m โ”‚ \u001b[36m 9\u001b[0m โ”‚ \u001b[32m โ–โ–‡โ–‡โ–‚ \u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mo_730_days_mean_fallbac\u001b[0m โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mk_nan \u001b[0m โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mpred_value_2_within_10_\u001b[0m โ”‚ \u001b[36m 10000\u001b[0m โ”‚ \u001b[36m 100\u001b[0m โ”‚ \u001b[36m nan\u001b[0m โ”‚ \u001b[36m nan\u001b[0m โ”‚ \u001b[36m nan\u001b[0m โ”‚ \u001b[36m nan\u001b[0m โ”‚ \u001b[36m nan\u001b[0m โ”‚ \u001b[36m nan\u001b[0m โ”‚ \u001b[36m nan\u001b[0m โ”‚ \u001b[32m \u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mto_365_days_mean_fallba\u001b[0m โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mck_nan \u001b[0m โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141moutc_value_within_0_to_\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0.0001\u001b[0m โ”‚ \u001b[36m 0.01\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 1\u001b[0m โ”‚ \u001b[32m โ–‡ \u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141m365_days_max_fallback_0\u001b[0m โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ”‚ \u001b[3m datetime \u001b[0m โ”‚\n", - "โ”‚ 
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfirst \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mlast \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mfrequency \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mtimestamp \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[31m 1965-01-02 09:35:00 \u001b[0m โ”‚ \u001b[31m 1969-12-31 21:42:00 \u001b[0m โ”‚ \u001b[38;5;141mNone \u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ”‚ \u001b[3m string \u001b[0m โ”‚\n", - "โ”‚ 
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”“ โ”‚\n", - "โ”‚ โ”ƒ\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mwords per row \u001b[0m\u001b[1m \u001b[0mโ”ƒ\u001b[1m \u001b[0m\u001b[1mtotal words \u001b[0m\u001b[1m \u001b[0mโ”ƒ โ”‚\n", - "โ”‚ โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฉ โ”‚\n", - "โ”‚ โ”‚ \u001b[38;5;141mpred_time_uuid \u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 0\u001b[0m โ”‚ \u001b[36m 2\u001b[0m โ”‚ \u001b[36m 20000\u001b[0m โ”‚ โ”‚\n", - "โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚\n", - "โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ End โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - 
"text/plain": [ - "['entity_id',\n", - " 'timestamp',\n", - " 'pred_time_uuid',\n", - " 'pred_female_fallback_nan',\n", - " 'pred_value_1_within_0_to_730_days_mean_fallback_nan',\n", - " 'pred_value_2_within_10_to_365_days_mean_fallback_nan',\n", - " 'outc_value_within_0_to_365_days_max_fallback_0']" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = flattener.aggregate_timeseries(\n", - " specs=[\n", - " sex_predictor_spec,\n", - " temporal_predictor_spec,\n", - " temporal_interval_predictor_spec,\n", - " outcome_spec,\n", - " ]\n", - ").df.collect()\n", - "\n", - "skim(df)\n", - "\n", - "list(df.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (10_000, 7)
entity_idtimestamppred_time_uuidpred_female_fallback_nanpredXpredX_30_to_90outc_Y
i64datetime[ฮผs]stri64f64f64i32
99031968-05-09 21:24:00"9903-1968-05-0โ€ฆ03.197213NaN0
74651966-05-24 01:23:00"7465-1966-05-2โ€ฆ14.243969NaN0
64471967-09-25 18:08:00"6447-1967-09-2โ€ฆ15.260492NaN0
21211966-05-05 20:52:00"2121-1966-05-0โ€ฆ04.798062NaN0
49271968-06-30 12:13:00"4927-1968-06-3โ€ฆ04.040067NaN0
54751967-01-09 03:09:00"5475-1967-01-0โ€ฆ05.953548NaN0
31571969-10-07 05:01:00"3157-1969-10-0โ€ฆ15.068696NaN0
97931968-12-15 12:59:00"9793-1968-12-1โ€ฆ06.93591NaN0
59621965-11-08 17:03:00"5962-1965-11-0โ€ฆ04.112929NaN0
97681967-07-04 23:09:00"9768-1967-07-0โ€ฆ15.053019NaN0
98611969-01-22 17:34:00"9861-1969-01-2โ€ฆ03.828896NaN0
6571969-04-14 15:47:00"657-1969-04-14โ€ฆ05.124361NaN0
65421967-04-15 14:37:00"6542-1967-04-1โ€ฆ16.099213NaN0
56071966-02-14 02:15:00"5607-1966-02-1โ€ฆ04.235413NaN0
42281967-02-26 05:45:00"4228-1967-02-2โ€ฆ05.181058NaN0
83371965-07-14 08:14:00"8337-1965-07-1โ€ฆ05.184373NaN0
97451969-02-04 01:18:00"9745-1969-02-0โ€ฆ16.005186NaN0
72221966-06-07 16:10:00"7222-1966-06-0โ€ฆ04.666051NaN0
33851967-07-17 19:18:00"3385-1967-07-1โ€ฆ16.98234NaN0
71591966-12-12 16:32:00"7159-1966-12-1โ€ฆ05.022668NaN0
1471965-03-12 05:32:00"147-1965-03-12โ€ฆ15.927239NaN0
14211968-04-15 15:53:00"1421-1968-04-1โ€ฆ06.061767NaN0
33531966-01-15 10:04:00"3353-1966-01-1โ€ฆ06.108344NaN0
19401968-05-17 10:49:00"1940-1968-05-1โ€ฆ04.909043NaN0
" - ], - "text/plain": [ - "shape: (10_000, 7)\n", - "โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”\n", - "โ”‚ entity_id โ”† timestamp โ”† pred_time_uuid โ”† pred_female_fa โ”† predX โ”† predX_30_to_9 โ”† outc_Y โ”‚\n", - "โ”‚ --- โ”† --- โ”† --- โ”† llback_nan โ”† --- โ”† 0 โ”† --- โ”‚\n", - "โ”‚ i64 โ”† datetime[ฮผs] โ”† str โ”† --- โ”† f64 โ”† --- โ”† i32 โ”‚\n", - "โ”‚ โ”† โ”† โ”† i64 โ”† โ”† f64 โ”† โ”‚\n", - "โ•žโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•ก\n", - "โ”‚ 9903 โ”† 1968-05-09 โ”† 9903-1968-05-0 โ”† 0 โ”† 3.197213 โ”† NaN โ”† 0 โ”‚\n", - "โ”‚ โ”† 21:24:00 โ”† 9 21:24:00.000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ 7465 โ”† 1966-05-24 โ”† 7465-1966-05-2 โ”† 1 โ”† 4.243969 โ”† NaN โ”† 0 โ”‚\n", - "โ”‚ โ”† 01:23:00 โ”† 4 01:23:00.000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ 6447 โ”† 1967-09-25 โ”† 6447-1967-09-2 โ”† 1 โ”† 5.260492 โ”† NaN โ”† 0 โ”‚\n", - "โ”‚ โ”† 18:08:00 โ”† 5 18:08:00.000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ 2121 โ”† 1966-05-05 โ”† 2121-1966-05-0 โ”† 0 โ”† 4.798062 โ”† NaN โ”† 0 โ”‚\n", - "โ”‚ โ”† 20:52:00 โ”† 5 20:52:00.000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ 4927 โ”† 1968-06-30 โ”† 4927-1968-06-3 โ”† 0 โ”† 4.040067 โ”† NaN โ”† 0 โ”‚\n", - "โ”‚ โ”† 12:13:00 โ”† 0 12:13:00.000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ โ€ฆ โ”† โ€ฆ โ”† โ€ฆ โ”† โ€ฆ โ”† โ€ฆ โ”† โ€ฆ โ”† 
โ€ฆ โ”‚\n", - "โ”‚ 7159 โ”† 1966-12-12 โ”† 7159-1966-12-1 โ”† 0 โ”† 5.022668 โ”† NaN โ”† 0 โ”‚\n", - "โ”‚ โ”† 16:32:00 โ”† 2 16:32:00.000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ 147 โ”† 1965-03-12 โ”† 147-1965-03-12 โ”† 1 โ”† 5.927239 โ”† NaN โ”† 0 โ”‚\n", - "โ”‚ โ”† 05:32:00 โ”† 05:32:00.00000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ โ”† โ”† 0 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ 1421 โ”† 1968-04-15 โ”† 1421-1968-04-1 โ”† 0 โ”† 6.061767 โ”† NaN โ”† 0 โ”‚\n", - "โ”‚ โ”† 15:53:00 โ”† 5 15:53:00.000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ 3353 โ”† 1966-01-15 โ”† 3353-1966-01-1 โ”† 0 โ”† 6.108344 โ”† NaN โ”† 0 โ”‚\n", - "โ”‚ โ”† 10:04:00 โ”† 5 10:04:00.000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ 1940 โ”† 1968-05-17 โ”† 1940-1968-05-1 โ”† 0 โ”† 4.909043 โ”† NaN โ”† 0 โ”‚\n", - "โ”‚ โ”† 10:49:00 โ”† 7 10:49:00.000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ”‚ โ”† โ”† 000 โ”† โ”† โ”† โ”† โ”‚\n", - "โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# For displayability, shorten col names\n", - "shortened_pred = \"predX\"\n", - "shortened_predinterval = \"predX_30_to_90\"\n", - "shortened_outcome = \"outc_Y\"\n", - "\n", - "display_df = df.rename(\n", - " {\n", - " \"pred_value_1_within_0_to_730_days_mean_fallback_nan\": shortened_pred,\n", - " \"pred_value_2_within_10_to_365_days_mean_fallback_nan\": shortened_predinterval,\n", - " \"outc_outcome_within_0_to_365_days_max_fallback_0\": shortened_outcome,\n", - " }\n", - ")\n", - "display_df" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And there 
we go! A dataframe ready for classification, containing:\n", - "\n", - "1. The citizen IDs\n", - "2. Timestamps for each prediction time\n", - "3. A unique identifier for each prediciton-time\n", - "4. Our predictor columns, prefixed with `pred_` and\n", - "5. Our outcome columns, prefixed with `outc_`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.18" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "d2b49c0af2d95979144de75823f7cfbb268839811992fdd0cb17fc1bb54ce815" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/docs/tutorials/02_advanced.ipynb b/docs/tutorials/02_advanced.ipynb index a6fe3812..f26f82da 100644 --- a/docs/tutorials/02_advanced.ipynb +++ b/docs/tutorials/02_advanced.ipynb @@ -206,7 +206,7 @@ " )\n", ")\n", "\n", - "df = flattener.aggregate_timeseries(specs=[predictor_spec]).df.collect()" + "df = flattener.aggregate_timeseries(specs=[predictor_spec]).df" ] }, { @@ -491,7 +491,7 @@ } ], "source": [ - "df = flattener.aggregate_timeseries(specs=[predictor_spec]).df.collect()\n", + "df = flattener.aggregate_timeseries(specs=[predictor_spec]).df\n", "\n", "df.head()" ] @@ -676,7 +676,7 @@ } ], "source": [ - "df = flattener.aggregate_timeseries(specs=[age_spec]).df.collect()\n", + "df = flattener.aggregate_timeseries(specs=[age_spec]).df\n", "\n", "df.head()" ] @@ -757,4 +757,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/docs/tutorials/03_text.ipynb b/docs/tutorials/03_text.ipynb index b2c36fea..f05175a5 100644 --- 
a/docs/tutorials/03_text.ipynb +++ b/docs/tutorials/03_text.ipynb @@ -278,7 +278,7 @@ " )\n", ")\n", "\n", - "df = flattener.aggregate_timeseries(specs=[text_spec]).df.collect()" + "df = flattener.aggregate_timeseries(specs=[text_spec]).df" ] }, { @@ -392,4 +392,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/example.py b/example.py index 3e2579d5..6cdf991c 100644 --- a/example.py +++ b/example.py @@ -32,7 +32,7 @@ predictor_spec = PredictorSpec( value_frame=ValueFrame( - init_df=predictor_df.lazy(), entity_id_col_name="id", value_timestamp_col_name="date" + init_df=predictor_df, entity_id_col_name="id", value_timestamp_col_name="date" ), lookbehind_distances=[dt.timedelta(days=1)], aggregators=[MaxAggregator(), MinAggregator()], @@ -42,7 +42,7 @@ outcome_spec = OutcomeSpec( value_frame=ValueFrame( - init_df=outcome_df.lazy(), entity_id_col_name="id", value_timestamp_col_name="date" + init_df=outcome_df, entity_id_col_name="id", value_timestamp_col_name="date" ), lookahead_distances=[dt.timedelta(days=1)], aggregators=[MaxAggregator(), MinAggregator()], @@ -55,7 +55,7 @@ result = Flattener( predictiontime_frame=PredictionTimeFrame( - init_df=prediction_times_df.lazy(), entity_id_col_name="id", timestamp_col_name="date" + init_df=prediction_times_df, entity_id_col_name="id", timestamp_col_name="date" ) ).aggregate_timeseries(specs=[predictor_spec, outcome_spec]) -result.collect() +result # noqa: B018 diff --git a/pyproject.toml b/pyproject.toml index ef0b294f..a870ffe4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "numpy>=1.23.3", "pyarrow>=8.0.0", "protobuf<=4.24.4", - # Other versions give errors with pytest, super weird! 
+ # Other versions of protobuf give errors with pytest "frozendict>=2.3.4", "coloredlogs>14.0.0", "tqdm>4.1.0", diff --git a/src/test_benchmark.py b/src/test_benchmark.py index 590aa125..6e091ff9 100644 --- a/src/test_benchmark.py +++ b/src/test_benchmark.py @@ -22,7 +22,7 @@ def _generate_valueframe(n_obseravations: int, feature_name: str) -> ValueFrame: return ValueFrame( - init_df=pl.LazyFrame( + init_df=pl.DataFrame( { "entity_id": list(range(n_obseravations)), feature_name: [random.random() for _ in range(n_obseravations)], @@ -53,7 +53,7 @@ def _generate_benchmark_dataset( lookbehinds: Sequence[dt.timedelta | tuple[dt.timedelta, dt.timedelta]], ) -> BenchmarkDataset: pred_time_df = PredictionTimeFrame( - init_df=pl.LazyFrame( + init_df=pl.DataFrame( { "entity_id": list(range(n_pred_times)), "pred_timestamp": [ @@ -141,7 +141,7 @@ def test_bench( lookbehinds=[dt.timedelta(days=i + 1) for i in range(example.n_lookbehinds)], ) - flattener = Flattener(predictiontime_frame=dataset.pred_time_frame, compute_lazily=False) + flattener = Flattener(predictiontime_frame=dataset.pred_time_frame) try: diff --git a/src/timeseriesflattener/aggregators_test.py b/src/timeseriesflattener/aggregators_test.py index 3b3fc4b2..07748eaf 100644 --- a/src/timeseriesflattener/aggregators_test.py +++ b/src/timeseriesflattener/aggregators_test.py @@ -34,7 +34,7 @@ @dataclass(frozen=True) class ComplexAggregatorExample: aggregator: Aggregator - input_frame: pl.LazyFrame + input_frame: pl.DataFrame expected_output: pl.DataFrame @@ -46,8 +46,8 @@ class SingleVarAggregatorExample: fallback_str: str = "nan" @property - def input_frame(self) -> pl.LazyFrame: - return pl.LazyFrame( + def input_frame(self) -> pl.DataFrame: + return pl.DataFrame( { "prediction_time_uuid": [1] * len(self.input_values), "value": self.input_values, @@ -110,7 +110,7 @@ def expected_output(self) -> pl.DataFrame: 1,2013-01-01,1 1,2013-01-02,3 """ - ).lazy(), + ), expected_output=str_to_pl_df( 
"""prediction_time_uuid,value_slope_fallback_nan 1,2.0, @@ -125,7 +125,7 @@ def expected_output(self) -> pl.DataFrame: 1,2013-01-02,2, # Dropped, second value in 1 2,2013-01-04,3, # Dropped, second value in 2 2,2013-01-03,4, # Kept, first value in 2""" - ).lazy(), + ), expected_output=str_to_pl_df( """prediction_time_uuid,value_earliest_fallback_nan 1,1, @@ -140,7 +140,7 @@ def expected_output(self) -> pl.DataFrame: 1,2013-01-02,2, # Kept, second value in 1 2,2013-01-04,3, # Kept, second value in 2 2,2013-01-03,4, # Dropped, first value in 2""" - ).lazy(), + ), expected_output=str_to_pl_df( """prediction_time_uuid,value_latest_fallback_nan 1,2, @@ -162,7 +162,7 @@ def test_aggregator(example: AggregatorExampleType): fallback=np.nan if example.aggregator.name != "bool" else False, ) - assert_frame_equal(result.collect(), example.expected_output) + assert_frame_equal(result, example.expected_output) @pytest.mark.parametrize( diff --git a/src/timeseriesflattener/intermediary.py b/src/timeseriesflattener/intermediary.py index 7413dc08..30f5ada7 100644 --- a/src/timeseriesflattener/intermediary.py +++ b/src/timeseriesflattener/intermediary.py @@ -1,12 +1,11 @@ from __future__ import annotations -from dataclasses import InitVar, dataclass +from dataclasses import dataclass from typing import TYPE_CHECKING import polars as pl from .validators import validate_col_name_columns_exist -from .utils import anyframe_to_lazyframe if TYPE_CHECKING: from collections.abc import Sequence @@ -19,7 +18,7 @@ class TimeMaskedFrame: """A frame that has had all values outside its lookbehind and lookahead distances masked.""" - init_df: pl.LazyFrame + init_df: pl.DataFrame value_col_names: Sequence[str] timestamp_col_name: str = "timestamp" prediction_time_uuid_col_name: str = "prediction_time_uuid" @@ -30,16 +29,13 @@ def __post_init__(self): validate_col_name_columns_exist(obj=self) @property - def df(self) -> pl.LazyFrame: + def df(self) -> pl.DataFrame: return self.init_df - def 
collect(self) -> pl.DataFrame: - return self.init_df.collect() - @dataclass class AggregatedValueFrame: - df: pl.LazyFrame + df: pl.DataFrame value_col_name: str prediction_time_uuid_col_name: str = "prediction_time_uuid" @@ -62,12 +58,12 @@ def fill_nulls(self, fallback: int | float | str | None) -> AggregatedValueFrame def collect(self) -> pl.DataFrame: if isinstance(self.df, pl.DataFrame): return self.df - return self.df.collect() + return self.df @dataclass class TimeDeltaFrame: - df: pl.LazyFrame + df: pl.DataFrame value_col_names: Sequence[str] value_timestamp_col_name: str prediction_time_uuid_col_name: str = "prediction_time_uuid" @@ -77,10 +73,7 @@ def __post_init__(self): validate_col_name_columns_exist(obj=self) def get_timedeltas(self) -> Sequence[dt.datetime]: - return self.collect().get_column(self.timedelta_col_name).to_list() - - def collect(self) -> pl.DataFrame: - return self.df.collect() + return self.df.get_column(self.timedelta_col_name).to_list() @dataclass @@ -94,26 +87,18 @@ class AggregatedFrame: prediction_time_uuid_col_name: The name of the column containing the prediction time uuids. Must be a string, and the column's values must be strings which are unique. 
""" - init_df: InitVar[pl.LazyFrame] + df: pl.DataFrame entity_id_col_name: str timestamp_col_name: str prediction_time_uuid_col_name: str - def __post_init__(self, init_df: pl.LazyFrame): - self.df = anyframe_to_lazyframe(init_df) - - def collect(self) -> pl.DataFrame: - if isinstance(self.df, pl.DataFrame): - return self.df - return self.df.collect() - @dataclass(frozen=True) class ProcessedFrame: - df: pl.LazyFrame + df: pl.DataFrame prediction_time_uuid_col_name: str def collect(self) -> pl.DataFrame: if isinstance(self.df, pl.DataFrame): return self.df - return self.df.collect() + return self.df diff --git a/src/timeseriesflattener/main.py b/src/timeseriesflattener/main.py index ef1b4554..ca14a549 100644 --- a/src/timeseriesflattener/main.py +++ b/src/timeseriesflattener/main.py @@ -34,7 +34,7 @@ class SpecError(Exception): description: str -def _get_spec_conflicts(specs: Sequence[ValueSpecification]) -> Iter[SpecError]: +def _get_spec_conflicts(specs: Sequence[ValueSpecification]) -> list[SpecError]: conflicting_value_col_names = ( Iter(specs) .map(lambda s: s.value_frame.value_col_names) @@ -48,7 +48,7 @@ def _get_spec_conflicts(specs: Sequence[ValueSpecification]) -> Iter[SpecError]: ) ) - return conflicting_value_col_names + return conflicting_value_col_names.to_list() @dataclass(frozen=True) @@ -69,7 +69,7 @@ def missing_columns(self) -> Iter[str]: def _specs_contain_required_columns( specs: Sequence[ValueSpecification], predictiontime_frame: PredictionTimeFrame -) -> Iter[MissingColumnNameError]: +) -> list[MissingColumnNameError]: missing_col_names = ( Iter(specs) .map( @@ -86,77 +86,34 @@ def _specs_contain_required_columns( ) ) - return missing_col_names + return missing_col_names.to_list() @dataclass class Flattener: predictiontime_frame: PredictionTimeFrame - compute_lazily: bool = False n_workers: int | None = None """Flatten multiple irregular time series to a static feature set. 
Args: predictiontime_frame: A frame that contains the prediction times. - compute_lazily: If True, the computation will be done lazily. n_workers: The number of workers to use for multiprocessing. If None, multiprocessing will be handled entirely by polars, otherwise, - specify the number of workers to use with joblib. """ + multiple processes will be used with joblib. + Multiprocessing adds some performance at the cost of memory pressure.""" - def aggregate_timeseries( + def _process_specs( self, specs: Sequence[ValueSpecification], step_size: dt.timedelta | None = None - ) -> AggregatedFrame: - """Perform the aggregation/flattening. - - Args: - specs: The specifications for the features to be created. - step_size: The step size for the aggregation. - If not None, will aggregate prediction times in chunks of step_size. - Reduce if you encounter memory issues.""" - if self.compute_lazily: - print( - "We have encountered performance issues on Windows when using lazy evaluation. If you encounter performance issues, try setting lazy=False." - ) - - # Check for conflicts in the specs - conflicting_specs = _get_spec_conflicts(specs).to_list() - underspecified_specs = _specs_contain_required_columns( - specs=specs, predictiontime_frame=self.predictiontime_frame - ).to_list() - errors = Iter([conflicting_specs, underspecified_specs]).flatten() - - if errors.count() > 0: - raise SpecError( - "Conflicting specs." 
- + "".join(errors.map(lambda error: f" \n - {error.description}").to_list()) # type: ignore - ) - - if not self.compute_lazily: - self.predictiontime_frame.df = self.predictiontime_frame.collect() # type: ignore - for spec in specs: - spec.value_frame.df = spec.value_frame.collect() # type: ignore - else: - self.predictiontime_frame.df = self.predictiontime_frame.df.lazy() - for spec in specs: - spec.value_frame.df = spec.value_frame.df.lazy() - - self.predictiontime_frame.df = self.predictiontime_frame.df.sort( - self.predictiontime_frame.timestamp_col_name - ) # type: ignore - + ) -> Sequence[pl.DataFrame]: # Process and collect the specs. One-by-one, to get feedback on progress. - dfs: Sequence[pl.LazyFrame] = [] + dfs: Sequence[pl.DataFrame] = [] if self.n_workers is None: for spec in track(specs, description="Processing specs..."): print(f"Processing spec: {spec.value_frame.value_col_names}") processed_spec = process_spec( predictiontime_frame=self.predictiontime_frame, spec=spec, step_size=step_size ) - - if isinstance(processed_spec.df, pl.LazyFrame): - dfs.append(processed_spec.collect().lazy()) - else: - dfs.append(processed_spec.df) + dfs.append(processed_spec.df) else: print( "Processing specs with multiprocessing. Note that this multiplies memory pressure by the number of workers. If you run out of memory, try reducing the number of workers, or relying exclusively on Polars paralellisation or setting it to None." @@ -170,14 +127,42 @@ def aggregate_timeseries( ) dfs = [value_frame.df for value_frame in value_frames] + return dfs + + def aggregate_timeseries( + self, specs: Sequence[ValueSpecification], step_size: dt.timedelta | None = None + ) -> AggregatedFrame: + """Perform the aggregation/flattening. + + Args: + specs: The specifications for the features to be created. + step_size: The chunk size for prediction time processing. + If None, will process all prediction times in one go. 
+ If not None, will process prediction times in chunks of step_size. + Smaller chunk sizes will reduce memory pressure, but increase processing time. + """ + + # Check for errors in specs + errors = _get_spec_conflicts(specs) + _specs_contain_required_columns( + specs=specs, predictiontime_frame=self.predictiontime_frame + ) + + if len(errors) > 0: + raise SpecError( + "Conflicting specs." + + "".join(Iter(errors).map(lambda error: f" \n - {error.description}").to_list()) + ) + + dfs = self._process_specs(specs=specs, step_size=step_size) + feature_dfs = horizontally_concatenate_dfs( dfs, prediction_time_uuid_col_name=self.predictiontime_frame.prediction_time_uuid_col_name, ) return AggregatedFrame( - init_df=horizontally_concatenate_dfs( - [self.predictiontime_frame.df, feature_dfs], # type: ignore + df=horizontally_concatenate_dfs( + [self.predictiontime_frame.df, feature_dfs], prediction_time_uuid_col_name=self.predictiontime_frame.prediction_time_uuid_col_name, ), entity_id_col_name=self.predictiontime_frame.entity_id_col_name, diff --git a/src/timeseriesflattener/main_test.py b/src/timeseriesflattener/main_test.py index 271b282e..88e7cdf3 100644 --- a/src/timeseriesflattener/main_test.py +++ b/src/timeseriesflattener/main_test.py @@ -24,14 +24,14 @@ from collections.abc import Sequence FakePredictiontimeFrame = PredictionTimeFrame( - init_df=pl.LazyFrame({"entity_id": [1], "pred_timestamp": ["2021-01-03"]}) + init_df=pl.DataFrame({"entity_id": [1], "pred_timestamp": ["2021-01-03"]}) ) FakeValueFrame = ValueFrame( - init_df=pl.LazyFrame({"entity_id": [1], "value": [1], "timestamp": ["2021-01-01"]}) + init_df=pl.DataFrame({"entity_id": [1], "value": [1], "timestamp": ["2021-01-01"]}) ) FakePredictorSpec = PredictorSpec( value_frame=ValueFrame( - init_df=pl.LazyFrame( + init_df=pl.DataFrame( {"entity_id": [1], "FakeValueColName": [1], "timestamp": ["2021-01-01"]} ) ), @@ -62,8 +62,8 @@ class FlattenerExample: @pytest.mark.parametrize( ("example"), [ - 
FlattenerExample(should="work with lazy flattening", lazy=True), - FlattenerExample(should="work with eager flattening", lazy=False), + FlattenerExample(should="work with lazy flattening"), + FlattenerExample(should="work with eager flattening"), FlattenerExample(should="work with multiprocessing", n_workers=2), ], ids=lambda example: example.should, @@ -82,13 +82,11 @@ def main_tests(example: FlattenerExample): ) result = Flattener( - predictiontime_frame=PredictionTimeFrame(init_df=pred_frame.lazy()), - compute_lazily=example.lazy, - n_workers=example.n_workers, + predictiontime_frame=PredictionTimeFrame(init_df=pred_frame), n_workers=example.n_workers ).aggregate_timeseries( specs=[ PredictorSpec( - value_frame=ValueFrame(init_df=value_frame.lazy()), + value_frame=ValueFrame(init_df=value_frame), lookbehind_distances=[dt.timedelta(days=1)], aggregators=[MeanAggregator()], fallback=np.nan, @@ -101,7 +99,7 @@ def main_tests(example: FlattenerExample): 1,2021-01-03 00:00:00.000000,1-2021-01-03 00:00:00.000000,3.0""" ) - assert_frame_equal(result.collect(), expected) + assert_frame_equal(result.df, expected) def test_keep_prediction_times_without_predictors(): @@ -116,11 +114,11 @@ def test_keep_prediction_times_without_predictors(): ) result = Flattener( - predictiontime_frame=PredictionTimeFrame(init_df=pred_frame.lazy()) + predictiontime_frame=PredictionTimeFrame(init_df=pred_frame) ).aggregate_timeseries( specs=[ PredictorSpec( - value_frame=ValueFrame(init_df=value_frame.lazy()), + value_frame=ValueFrame(init_df=value_frame), lookbehind_distances=[dt.timedelta(days=1)], aggregators=[MeanAggregator(), EarliestAggregator(timestamp_col_name="timestamp")], fallback=123, @@ -136,7 +134,7 @@ def test_keep_prediction_times_without_predictors(): } ) - assert_frame_equal(result.collect(), expected, ignore_colums=["entity_id", "pred_timestamp"]) + assert_frame_equal(result, expected, ignore_colums=["entity_id", "pred_timestamp"]) def main_tests_multiple_features(): @@ 
-153,17 +151,17 @@ def main_tests_multiple_features(): ) result = Flattener( - predictiontime_frame=PredictionTimeFrame(init_df=pred_frame.lazy()) + predictiontime_frame=PredictionTimeFrame(init_df=pred_frame) ).aggregate_timeseries( specs=[ PredictorSpec( - value_frame=ValueFrame(init_df=value_frame.rename({"value": "value_1"}).lazy()), + value_frame=ValueFrame(init_df=value_frame.rename({"value": "value_1"})), lookbehind_distances=[dt.timedelta(days=1)], aggregators=[MeanAggregator()], fallback=np.nan, ), PredictorSpec( - value_frame=ValueFrame(init_df=value_frame.rename({"value": "value_2"}).lazy()), + value_frame=ValueFrame(init_df=value_frame.rename({"value": "value_2"})), lookbehind_distances=[dt.timedelta(days=1)], aggregators=[MeanAggregator()], fallback=np.nan, @@ -176,7 +174,7 @@ def main_tests_multiple_features(): 1-2021-01-03 00:00:00.000000,3.0,3.0""" ) - assert_frame_equal(result.collect(), expected, ignore_colums=["entity_id", "pred_timestamp"]) + assert_frame_equal(result, expected, ignore_colums=["entity_id", "pred_timestamp"]) def test_error_if_conflicting_value_col_names(): @@ -192,7 +190,7 @@ def test_error_if_missing_entity_id_column(): ): Flattener( predictiontime_frame=PredictionTimeFrame( - init_df=pl.LazyFrame( + init_df=pl.DataFrame( { "no_entity_id": [1, 2, 3], "pred_timestamp": ["2013-01-01", "2013-01-01", "2013-01-01"], @@ -205,7 +203,7 @@ def test_error_if_missing_entity_id_column(): def test_error_if_missing_column_in_valueframe(): with pytest.raises(SpecColumnError, match="Missing columns: *"): - ValueFrame(init_df=pl.LazyFrame({"value": [1], "timestamp": ["2021-01-01"]})) + ValueFrame(init_df=pl.DataFrame({"value": [1], "timestamp": ["2021-01-01"]})) def test_predictor_with_interval_lookperiod(): @@ -232,7 +230,7 @@ def test_predictor_with_interval_lookperiod(): """prediction_time_uuid,pred_value_within_5_to_30_days_mean_fallback_nan 1-2022-01-01 00:00:00.000000,1""" ) - assert_frame_equal(result.collect(), expected, 
ignore_colums=["entity_id", "pred_timestamp"]) + assert_frame_equal(result, expected, ignore_colums=["entity_id", "pred_timestamp"]) def test_outcome_with_interval_lookperiod(): @@ -259,7 +257,7 @@ def test_outcome_with_interval_lookperiod(): """prediction_time_uuid,outc_value_within_5_to_30_days_mean_fallback_nan 1-2022-01-01 00:00:00.000000,1""" ) - assert_frame_equal(result.collect(), expected, ignore_colums=["entity_id", "pred_timestamp"]) + assert_frame_equal(result, expected, ignore_colums=["entity_id", "pred_timestamp"]) def test_add_static_spec(): @@ -286,7 +284,7 @@ def test_add_static_spec(): """prediction_time_uuid,outc_value_within_5_to_30_days_mean_fallback_nan 1-2022-01-01 00:00:00.000000,1""" ) - assert_frame_equal(result.collect(), expected, ignore_colums=["entity_id", "pred_timestamp"]) + assert_frame_equal(result, expected, ignore_colums=["entity_id", "pred_timestamp"]) def test_add_features_with_non_default_entity_id_col_name(): @@ -317,7 +315,7 @@ def test_add_features_with_non_default_entity_id_col_name(): """prediction_time_uuid,outc_value_within_5_to_30_days_mean_fallback_nan 1-2022-01-01 00:00:00.000000,1""" ) - assert_frame_equal(result.collect(), expected, ignore_colums=["dw_ek_borger", "pred_timestamp"]) + assert_frame_equal(result, expected, ignore_colums=["dw_ek_borger", "pred_timestamp"]) @pytest.mark.parametrize("step_size", [None, dt.timedelta(days=30)]) @@ -360,7 +358,7 @@ def test_multiple_features_with_unordered_prediction_times(step_size): """ ).sort("prediction_time_uuid") assert_frame_equal( - result.df.collect().sort("prediction_time_uuid"), + result.df.sort("prediction_time_uuid"), expected, ignore_colums=["entity_id", "pred_timestamp"], ) diff --git a/src/timeseriesflattener/processors/static_test.py b/src/timeseriesflattener/processors/static_test.py index 201d4f94..95a13a2f 100644 --- a/src/timeseriesflattener/processors/static_test.py +++ b/src/timeseriesflattener/processors/static_test.py @@ -37,7 +37,7 @@ def 
test_process_static_spec(): """ ) - assert_frame_equal(result.collect(), expected) + assert_frame_equal(result, expected) def test_process_static_spec_multiple_values(): @@ -65,4 +65,4 @@ def test_process_static_spec_multiple_values(): 2-2021-01-01 00:00:00.000000,c,d """ ) - assert_frame_equal(result.collect(), expected) + assert_frame_equal(result, expected) diff --git a/src/timeseriesflattener/processors/temporal.py b/src/timeseriesflattener/processors/temporal.py index 9da6158d..8c8c2c6d 100644 --- a/src/timeseriesflattener/processors/temporal.py +++ b/src/timeseriesflattener/processors/temporal.py @@ -55,8 +55,8 @@ def _get_timedelta_frame( def _null_values_outside_lookwindow( - df: pl.LazyFrame, lookwindow_predicate: pl.Expr, cols_to_null: Sequence[str] -) -> pl.LazyFrame: + df: pl.DataFrame, lookwindow_predicate: pl.Expr, cols_to_null: Sequence[str] +) -> pl.DataFrame: for col_to_null in cols_to_null: df = df.with_columns( pl.when(lookwindow_predicate).then(pl.col(col_to_null)).otherwise(None) @@ -108,7 +108,7 @@ def _aggregate_masked_frame( masked_frame: TimeMaskedFrame, aggregators: Sequence[Aggregator], fallback: int | float | str | None, -) -> pl.LazyFrame: +) -> pl.DataFrame: aggregator_expressions = [ aggregator(value_col_name) for aggregator in aggregators @@ -138,12 +138,12 @@ def _aggregate_masked_frame( TimeMasker = Callable[[TimeDeltaFrame], TimeMaskedFrame] -MaskedAggregator = Callable[[TimeMaskedFrame], pl.LazyFrame] +MaskedAggregator = Callable[[TimeMaskedFrame], pl.DataFrame] def _slice_and_aggregate_spec( timedelta_frame: TimeDeltaFrame, masked_aggregator: MaskedAggregator, time_masker: TimeMasker -) -> pl.LazyFrame: +) -> pl.DataFrame: sliced_frame = time_masker(timedelta_frame) return masked_aggregator(sliced_frame) @@ -152,8 +152,8 @@ def _slice_and_aggregate_spec( def _get_pred_time_range(frame: PredictionTimeFrame) -> tuple[dt.datetime, dt.datetime]: - if isinstance(frame.df, pl.LazyFrame): - df = frame.df.collect() + if 
isinstance(frame.df, pl.DataFrame): + df = frame.df else: df = frame.df @@ -219,7 +219,7 @@ def _create_step_frames( def _flatten_temporal_spec( spec: TemporalSpec, predictiontime_frame: PredictionTimeFrame, value_frame: ValueFrame -) -> list[pl.LazyFrame]: +) -> list[pl.DataFrame]: return ( Iter(spec.normalised_lookperiod) .map( diff --git a/src/timeseriesflattener/processors/temporal_test.py b/src/timeseriesflattener/processors/temporal_test.py index e4ba151d..f58bc039 100644 --- a/src/timeseriesflattener/processors/temporal_test.py +++ b/src/timeseriesflattener/processors/temporal_test.py @@ -22,7 +22,7 @@ def test_aggregate_over_fallback(): masked_frame = TimeMaskedFrame( validate_cols_exist=False, - init_df=pl.LazyFrame( + init_df=pl.DataFrame( { "prediction_time_uuid": ["1-2021-01-03", "1-2021-01-03"], "value": [None, None], @@ -41,13 +41,13 @@ def test_aggregate_over_fallback(): 1-2021-01-03,0""" ) - assert_frame_equal(aggregated_values.collect(), expected) + assert_frame_equal(aggregated_values, expected) def test_aggregate_with_null(): masked_frame = TimeMaskedFrame( validate_cols_exist=False, - init_df=pl.LazyFrame( + init_df=pl.DataFrame( { "prediction_time_uuid": ["1-2021-01-03", "1-2021-01-03"], "value": [1, None], @@ -66,7 +66,7 @@ def test_aggregate_with_null(): 1-2021-01-03,1""" ) - assert_frame_equal(aggregated_values.collect(), expected) + assert_frame_equal(aggregated_values, expected) def test_aggregate_within_slice(): @@ -78,7 +78,7 @@ def test_aggregate_within_slice(): 1-2021-01-03,2 2-2021-01-03,2 2-2021-01-03,4""" - ).lazy(), + ), value_col_names=["value"], ) @@ -92,7 +92,7 @@ def test_aggregate_within_slice(): 2-2021-01-03,3""" ) - assert_frame_equal(aggregated_values.collect(), expected) + assert_frame_equal(aggregated_values, expected) def test_get_timedelta_frame(): @@ -111,8 +111,8 @@ def test_get_timedelta_frame(): expected_timedeltas = [dt.timedelta(days=-2), dt.timedelta(days=-1), dt.timedelta(days=0)] result = 
process_spec._get_timedelta_frame( - predictiontime_frame=PredictionTimeFrame(init_df=pred_frame.lazy()), - value_frame=ValueFrame(init_df=value_frame.lazy()), + predictiontime_frame=PredictionTimeFrame(init_df=pred_frame), + value_frame=ValueFrame(init_df=value_frame), ) assert result.get_timedeltas() == expected_timedeltas @@ -135,9 +135,9 @@ def test_get_timedelta_frame_same_timestamp_col_names(): result = process_spec._get_timedelta_frame( predictiontime_frame=PredictionTimeFrame( - init_df=pred_frame.lazy(), timestamp_col_name="timestamp" + init_df=pred_frame, timestamp_col_name="timestamp" ), - value_frame=ValueFrame(init_df=value_frame.lazy()), + value_frame=ValueFrame(init_df=value_frame), ) assert result.get_timedeltas() == expected_timedeltas @@ -145,7 +145,7 @@ def test_get_timedelta_frame_same_timestamp_col_names(): def test_slice_without_any_within_window(): timedelta_frame = TimeDeltaFrame( - df=pl.LazyFrame( + df=pl.DataFrame( { "prediction_time_uuid": [1, 1, 2, 2], "time_from_prediction_to_value": [ @@ -167,13 +167,13 @@ def test_slice_without_any_within_window(): lookperiod=LookPeriod(first=dt.timedelta(days=-2), last=dt.timedelta(days=0)), column_prefix="pred", value_col_names=["is_null"], - ).collect() + ) from polars.testing import assert_series_equal assert_series_equal( result.get_column("pred_is_null_within_0_to_2_days"), - timedelta_frame.df.collect().get_column("is_null"), + timedelta_frame.df.get_column("is_null"), check_names=False, check_dtypes=False, ) @@ -188,7 +188,7 @@ def test_multiple_aggregators(): 1-2021-01-03,2 2-2021-01-03,2 2-2021-01-03,4""" - ).lazy(), + ), value_col_names=["value"], ) @@ -202,7 +202,7 @@ def test_multiple_aggregators(): 2-2021-01-03,3,4""" ) - assert_frame_equal(aggregated_values.collect(), expected) + assert_frame_equal(aggregated_values, expected) def test_masking_multiple_values_multiple_aggregators(): @@ -214,7 +214,7 @@ def test_masking_multiple_values_multiple_aggregators(): 1-2021-01-03,2,np.nan 
2-2021-01-03,2,np.nan 2-2021-01-03,4,np.nan""" - ).lazy(), + ), value_col_names=["value_1", "value_2"], ) @@ -228,7 +228,7 @@ def test_masking_multiple_values_multiple_aggregators(): 2-2021-01-03,3,0,4,0""" ) - assert_frame_equal(aggregated_values.collect(), expected) + assert_frame_equal(aggregated_values, expected) def test_process_time_from_event_spec(): @@ -261,7 +261,7 @@ def test_process_time_from_event_spec(): """ ) - assert_frame_equal(result.collect(), expected) + assert_frame_equal(result, expected) def test_process_temporal_spec_multiple_values(): @@ -276,19 +276,19 @@ def test_process_temporal_spec_multiple_values(): result = process_spec.process_temporal_spec( spec=PredictorSpec( - value_frame=ValueFrame(init_df=value_frame.lazy()), + value_frame=ValueFrame(init_df=value_frame), lookbehind_distances=[dt.timedelta(days=1)], aggregators=[MeanAggregator()], fallback=0, ), - predictiontime_frame=PredictionTimeFrame(init_df=pred_frame.lazy()), + predictiontime_frame=PredictionTimeFrame(init_df=pred_frame), ) expected = str_to_pl_df( """prediction_time_uuid,pred_value_1_within_0_to_1_days_mean_fallback_0,pred_value_2_within_0_to_1_days_mean_fallback_0 1-2021-01-01 00:00:00.000000,1,2""" ) - assert_frame_equal(result.collect(), expected) + assert_frame_equal(result, expected) def test_sliding_window(): @@ -317,7 +317,7 @@ def test_sliding_window(): result = process_spec.process_temporal_spec( spec=PredictorSpec( - value_frame=ValueFrame(init_df=value_frame.lazy()), + value_frame=ValueFrame(init_df=value_frame), lookbehind_distances=[ dt.timedelta(days=10), dt.timedelta(days=365), @@ -325,7 +325,7 @@ def test_sliding_window(): aggregators=[MeanAggregator()], fallback=0, ), - predictiontime_frame=PredictionTimeFrame(init_df=pred_frame.lazy()), + predictiontime_frame=PredictionTimeFrame(init_df=pred_frame), step_size=dt.timedelta(days=365), ) @@ -339,4 +339,4 @@ def test_sliding_window(): 1-2022-01-01 00:00:00.000000,0.0,11.5""" ) - 
assert_frame_equal(result.collect(), expected) + assert_frame_equal(result, expected) diff --git a/src/timeseriesflattener/specs/outcome.py b/src/timeseriesflattener/specs/outcome.py index c020c1ba..18622081 100644 --- a/src/timeseriesflattener/specs/outcome.py +++ b/src/timeseriesflattener/specs/outcome.py @@ -41,7 +41,7 @@ def __post_init__( ) @property - def df(self) -> pl.LazyFrame: + def df(self) -> pl.DataFrame: return self.value_frame.df @@ -79,5 +79,5 @@ def __post_init__(self, init_frame: TimestampValueFrame): ) @property - def df(self) -> pl.LazyFrame: + def df(self) -> pl.DataFrame: return self.value_frame.df diff --git a/src/timeseriesflattener/specs/prediction_times.py b/src/timeseriesflattener/specs/prediction_times.py index fe4ebdf2..2520b5a5 100644 --- a/src/timeseriesflattener/specs/prediction_times.py +++ b/src/timeseriesflattener/specs/prediction_times.py @@ -7,7 +7,7 @@ import polars as pl from ..validators import validate_col_name_columns_exist -from ..utils import anyframe_to_lazyframe +from ..utils import anyframe_to_pl_frame if TYPE_CHECKING: from collections.abc import Sequence @@ -22,21 +22,16 @@ class PredictionTimeFrame: timestamp_col_name: The name of the column containing the timestamps for when to make a prediction. """ - init_df: InitVar[pl.LazyFrame | pl.DataFrame | pd.DataFrame] + init_df: InitVar[pl.DataFrame | pd.DataFrame] entity_id_col_name: str = "entity_id" timestamp_col_name: str = "pred_timestamp" prediction_time_uuid_col_name: str = "prediction_time_uuid" - coerce_to_lazy: InitVar[bool] = True - def __post_init__( - self, init_df: pl.LazyFrame | pl.DataFrame | pd.DataFrame, coerce_to_lazy: bool - ): - if coerce_to_lazy: - self.df = anyframe_to_lazyframe(init_df) - else: - self.df: pl.LazyFrame = init_df # type: ignore + def __post_init__(self, init_df: pl.DataFrame | pd.DataFrame): + # Sort to ensure alignment when processing multiple specs and concatenating in the end. 
+ self.df = anyframe_to_pl_frame(init_df).sort(self.timestamp_col_name) - self.df = self.df.with_columns( # type: ignore + self.df = self.df.with_columns( pl.concat_str( pl.col(self.entity_id_col_name), pl.lit("-"), pl.col(self.timestamp_col_name) ).alias(self.prediction_time_uuid_col_name) @@ -45,9 +40,7 @@ def __post_init__( validate_col_name_columns_exist(obj=self) def collect(self) -> pl.DataFrame: - if isinstance(self.df, pl.DataFrame): - return self.df - return self.df.collect() + return self.df def required_columns(self) -> Sequence[str]: return [self.entity_id_col_name] diff --git a/src/timeseriesflattener/specs/static.py b/src/timeseriesflattener/specs/static.py index 848449f3..63f53c6a 100644 --- a/src/timeseriesflattener/specs/static.py +++ b/src/timeseriesflattener/specs/static.py @@ -6,24 +6,24 @@ import polars as pl from timeseriesflattener.validators import validate_col_name_columns_exist -from ..utils import anyframe_to_lazyframe +from ..utils import anyframe_to_pl_frame @dataclass class StaticFrame: - init_df: InitVar[pl.LazyFrame | pl.DataFrame | pd.DataFrame] + init_df: InitVar[pl.DataFrame | pd.DataFrame] entity_id_col_name: str = "entity_id" - def __post_init__(self, init_df: pl.LazyFrame | pl.DataFrame | pd.DataFrame): - self.df = anyframe_to_lazyframe(init_df) + def __post_init__(self, init_df: pl.DataFrame | pd.DataFrame): + self.df = anyframe_to_pl_frame(init_df) validate_col_name_columns_exist(obj=self) self.value_col_names = [col for col in self.df.columns if col != self.entity_id_col_name] def collect(self) -> pl.DataFrame: if isinstance(self.df, pl.DataFrame): return self.df - return self.df.collect() + return self.df @dataclass(frozen=True) diff --git a/src/timeseriesflattener/specs/temporal.py b/src/timeseriesflattener/specs/temporal.py index f44c17c8..829aeaeb 100644 --- a/src/timeseriesflattener/specs/temporal.py +++ b/src/timeseriesflattener/specs/temporal.py @@ -46,5 +46,5 @@ def __post_init__( ) @property - def df(self) -> 
pl.LazyFrame: + def df(self) -> pl.DataFrame: return self.value_frame.df diff --git a/src/timeseriesflattener/specs/test_specs.py b/src/timeseriesflattener/specs/test_specs.py index 79abbdaa..4ef58eb5 100644 --- a/src/timeseriesflattener/specs/test_specs.py +++ b/src/timeseriesflattener/specs/test_specs.py @@ -14,7 +14,7 @@ from .timestamp import TimestampValueFrame MockValueFrame = ValueFrame( - init_df=pl.LazyFrame({"value": [1], "timestamp": ["2021-01-01"], "entity_id": [1]}) + init_df=pl.DataFrame({"value": [1], "timestamp": ["2021-01-01"], "entity_id": [1]}) ) @@ -52,7 +52,7 @@ def test_timedelta_spec_error_if_non_unique_ids(): with pytest.raises(ValueError, match=".*Expected only one value.*"): TimeDeltaSpec( init_frame=TimestampValueFrame( - init_df=pl.LazyFrame( + init_df=pl.DataFrame( {"timestamp": ["2021-01-01", "2021-01-02"], "entity_id": [1, 1]} ), value_timestamp_col_name="timestamp", diff --git a/src/timeseriesflattener/specs/timedelta.py b/src/timeseriesflattener/specs/timedelta.py index 1e919ad2..9d7b74af 100644 --- a/src/timeseriesflattener/specs/timedelta.py +++ b/src/timeseriesflattener/specs/timedelta.py @@ -34,10 +34,7 @@ class TimeDeltaSpec: def __post_init__(self): validate_col_name_columns_exist(obj=self) max_values_per_id = ( - self.init_frame.collect() - .get_column(self.init_frame.entity_id_col_name) - .unique_counts() - .max() + self.init_frame.get_column(self.init_frame.entity_id_col_name).unique_counts().max() ) if max_values_per_id > 1: # type: ignore raise ValueError( @@ -53,5 +50,5 @@ def __post_init__(self): self.value_frame.value_col_names = [self.output_name] @property - def df(self) -> pl.LazyFrame: + def df(self) -> pl.DataFrame: return self.value_frame.df diff --git a/src/timeseriesflattener/specs/timestamp.py b/src/timeseriesflattener/specs/timestamp.py index 20429e94..7381939e 100644 --- a/src/timeseriesflattener/specs/timestamp.py +++ b/src/timeseriesflattener/specs/timestamp.py @@ -6,7 +6,7 @@ import polars as pl from 
timeseriesflattener.validators import validate_col_name_columns_exist -from ..utils import anyframe_to_lazyframe +from ..utils import anyframe_to_pl_frame @dataclass @@ -18,15 +18,15 @@ class TimestampValueFrame: value_timestamp_col_name: The name of the column containing the timestamps. Must be a string, and the column's values must be datetimes. """ - init_df: InitVar[pl.LazyFrame | pl.DataFrame | pd.DataFrame] + init_df: InitVar[pl.DataFrame | pd.DataFrame] entity_id_col_name: str = "entity_id" value_timestamp_col_name: str = "timestamp" - def __post_init__(self, init_df: pl.LazyFrame | pl.DataFrame | pd.DataFrame): - self.df = anyframe_to_lazyframe(init_df) + def __post_init__(self, init_df: pl.DataFrame | pd.DataFrame): + self.df = anyframe_to_pl_frame(init_df) validate_col_name_columns_exist(obj=self) def collect(self) -> pl.DataFrame: if isinstance(self.df, pl.DataFrame): return self.df - return self.df.collect() + return self.df diff --git a/src/timeseriesflattener/specs/value.py b/src/timeseriesflattener/specs/value.py index 12069a6e..9eb6fe9d 100644 --- a/src/timeseriesflattener/specs/value.py +++ b/src/timeseriesflattener/specs/value.py @@ -8,7 +8,7 @@ import polars as pl from ..validators import validate_col_name_columns_exist -from ..utils import anyframe_to_lazyframe +from ..utils import anyframe_to_pl_frame @dataclass @@ -21,18 +21,12 @@ class ValueFrame: Additional columns containing the values of the time series. The name of the columns will be used for feature naming. 
""" - init_df: InitVar[pl.LazyFrame | pl.DataFrame | pd.DataFrame] + init_df: InitVar[pl.DataFrame | pd.DataFrame] entity_id_col_name: str = "entity_id" value_timestamp_col_name: str = "timestamp" - coerce_to_lazy: InitVar[bool] = True - def __post_init__( - self, init_df: pl.LazyFrame | pl.DataFrame | pd.DataFrame, coerce_to_lazy: bool - ): - if coerce_to_lazy: - self.df = anyframe_to_lazyframe(init_df) - else: - self.df: pl.LazyFrame = init_df # type: ignore + def __post_init__(self, init_df: pl.DataFrame | pd.DataFrame): + self.df = anyframe_to_pl_frame(init_df) validate_col_name_columns_exist(obj=self) self.value_col_names = [ @@ -41,11 +35,6 @@ def __post_init__( if col not in [self.entity_id_col_name, self.value_timestamp_col_name] ] - def collect(self) -> pl.DataFrame: - if isinstance(self.df, pl.DataFrame): - return self.df - return self.df.collect() - @dataclass(frozen=True) class LookPeriod: diff --git a/src/timeseriesflattener/utils.py b/src/timeseriesflattener/utils.py index 3e7791b4..bb3d6cd4 100644 --- a/src/timeseriesflattener/utils.py +++ b/src/timeseriesflattener/utils.py @@ -19,8 +19,8 @@ def horizontally_concatenate_dfs( - dfs: Sequence[pl.LazyFrame], prediction_time_uuid_col_name: str -) -> pl.LazyFrame: + dfs: Sequence[pl.DataFrame], prediction_time_uuid_col_name: str +) -> pl.DataFrame: dfs_without_identifiers = ( Iter(dfs).map(lambda df: df.drop([prediction_time_uuid_col_name])).to_list() ) @@ -28,11 +28,9 @@ def horizontally_concatenate_dfs( return pl.concat([dfs[0], *dfs_without_identifiers[1:]], how="horizontal") -def anyframe_to_lazyframe(init_df: pl.LazyFrame | pl.DataFrame | pd.DataFrame) -> pl.LazyFrame: - if isinstance(init_df, pl.LazyFrame): - return init_df +def anyframe_to_pl_frame(init_df: pl.DataFrame | pd.DataFrame) -> pl.DataFrame: if isinstance(init_df, pl.DataFrame): - return init_df.lazy() + return init_df if isinstance(init_df, pd.DataFrame): - return pl.from_pandas(init_df).lazy() + return pl.from_pandas(init_df) raise 
ValueError(f"Unsupported type: {type(init_df)}.") From f9ff3c0953fb89eb9cc5ff8b3c7934e574280fa2 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Jun 2024 13:09:58 +0200 Subject: [PATCH 2/3] update 7 files --- example.py | 2 +- src/timeseriesflattener/main.py | 6 +++++- src/timeseriesflattener/main_test.py | 12 ++++++------ src/timeseriesflattener/processors/static_test.py | 4 ++-- src/timeseriesflattener/processors/temporal.py | 2 -- src/timeseriesflattener/processors/temporal_test.py | 8 ++++---- src/timeseriesflattener/specs/timedelta.py | 2 +- 7 files changed, 19 insertions(+), 17 deletions(-) diff --git a/example.py b/example.py index 6cdf991c..a7b0a90e 100644 --- a/example.py +++ b/example.py @@ -58,4 +58,4 @@ init_df=prediction_times_df, entity_id_col_name="id", timestamp_col_name="date" ) ).aggregate_timeseries(specs=[predictor_spec, outcome_spec]) -result # noqa: B018 +result # type: ignore # noqa: B018 diff --git a/src/timeseriesflattener/main.py b/src/timeseriesflattener/main.py index ca14a549..811ae01b 100644 --- a/src/timeseriesflattener/main.py +++ b/src/timeseriesflattener/main.py @@ -100,7 +100,11 @@ class Flattener: n_workers: The number of workers to use for multiprocessing. If None, multiprocessing will be handled entirely by polars, otherwise, multiple processes will be used with joblib. - Multiprocessing adds some performance at the cost of memory pressure.""" + Multiprocessing adds some performance at the cost of memory pressure. + Note that we already attempted multi-threaded processing with Polars, but the query + optimiser took an infinite amount of time to optimise the query, + so we removed it after commit 73772874802940b6b1e17c110b9c06aa4dd5f8fb. 
+ """ def _process_specs( self, specs: Sequence[ValueSpecification], step_size: dt.timedelta | None = None diff --git a/src/timeseriesflattener/main_test.py b/src/timeseriesflattener/main_test.py index 88e7cdf3..eb26b33b 100644 --- a/src/timeseriesflattener/main_test.py +++ b/src/timeseriesflattener/main_test.py @@ -134,7 +134,7 @@ def test_keep_prediction_times_without_predictors(): } ) - assert_frame_equal(result, expected, ignore_colums=["entity_id", "pred_timestamp"]) + assert_frame_equal(result.df, expected, ignore_colums=["entity_id", "pred_timestamp"]) def main_tests_multiple_features(): @@ -174,7 +174,7 @@ def main_tests_multiple_features(): 1-2021-01-03 00:00:00.000000,3.0,3.0""" ) - assert_frame_equal(result, expected, ignore_colums=["entity_id", "pred_timestamp"]) + assert_frame_equal(result.df, expected, ignore_colums=["entity_id", "pred_timestamp"]) def test_error_if_conflicting_value_col_names(): @@ -230,7 +230,7 @@ def test_predictor_with_interval_lookperiod(): """prediction_time_uuid,pred_value_within_5_to_30_days_mean_fallback_nan 1-2022-01-01 00:00:00.000000,1""" ) - assert_frame_equal(result, expected, ignore_colums=["entity_id", "pred_timestamp"]) + assert_frame_equal(result.df, expected, ignore_colums=["entity_id", "pred_timestamp"]) def test_outcome_with_interval_lookperiod(): @@ -257,7 +257,7 @@ def test_outcome_with_interval_lookperiod(): """prediction_time_uuid,outc_value_within_5_to_30_days_mean_fallback_nan 1-2022-01-01 00:00:00.000000,1""" ) - assert_frame_equal(result, expected, ignore_colums=["entity_id", "pred_timestamp"]) + assert_frame_equal(result.df, expected, ignore_colums=["entity_id", "pred_timestamp"]) def test_add_static_spec(): @@ -284,7 +284,7 @@ def test_add_static_spec(): """prediction_time_uuid,outc_value_within_5_to_30_days_mean_fallback_nan 1-2022-01-01 00:00:00.000000,1""" ) - assert_frame_equal(result, expected, ignore_colums=["entity_id", "pred_timestamp"]) + assert_frame_equal(result.df, expected, 
ignore_colums=["entity_id", "pred_timestamp"]) def test_add_features_with_non_default_entity_id_col_name(): @@ -315,7 +315,7 @@ def test_add_features_with_non_default_entity_id_col_name(): """prediction_time_uuid,outc_value_within_5_to_30_days_mean_fallback_nan 1-2022-01-01 00:00:00.000000,1""" ) - assert_frame_equal(result, expected, ignore_colums=["dw_ek_borger", "pred_timestamp"]) + assert_frame_equal(result.df, expected, ignore_colums=["dw_ek_borger", "pred_timestamp"]) @pytest.mark.parametrize("step_size", [None, dt.timedelta(days=30)]) diff --git a/src/timeseriesflattener/processors/static_test.py b/src/timeseriesflattener/processors/static_test.py index 95a13a2f..bd7cfe9f 100644 --- a/src/timeseriesflattener/processors/static_test.py +++ b/src/timeseriesflattener/processors/static_test.py @@ -37,7 +37,7 @@ def test_process_static_spec(): """ ) - assert_frame_equal(result, expected) + assert_frame_equal(result.df, expected) def test_process_static_spec_multiple_values(): @@ -65,4 +65,4 @@ def test_process_static_spec_multiple_values(): 2-2021-01-01 00:00:00.000000,c,d """ ) - assert_frame_equal(result, expected) + assert_frame_equal(result.df, expected) diff --git a/src/timeseriesflattener/processors/temporal.py b/src/timeseriesflattener/processors/temporal.py index 8c8c2c6d..8624037a 100644 --- a/src/timeseriesflattener/processors/temporal.py +++ b/src/timeseriesflattener/processors/temporal.py @@ -208,12 +208,10 @@ def _create_step_frames( entity_id_col_name=predictiontime_frame.entity_id_col_name, timestamp_col_name=predictiontime_frame.timestamp_col_name, prediction_time_uuid_col_name=predictiontime_frame.prediction_time_uuid_col_name, - coerce_to_lazy=False, ), ValueFrame( init_df=step_value_df, entity_id_col_name=vf.entity_id_col_name, value_timestamp_col_name=vf.value_timestamp_col_name, - coerce_to_lazy=False, ) diff --git a/src/timeseriesflattener/processors/temporal_test.py b/src/timeseriesflattener/processors/temporal_test.py index 
f58bc039..340e0121 100644 --- a/src/timeseriesflattener/processors/temporal_test.py +++ b/src/timeseriesflattener/processors/temporal_test.py @@ -172,7 +172,7 @@ def test_slice_without_any_within_window(): from polars.testing import assert_series_equal assert_series_equal( - result.get_column("pred_is_null_within_0_to_2_days"), + result.df.get_column("pred_is_null_within_0_to_2_days"), timedelta_frame.df.get_column("is_null"), check_names=False, check_dtypes=False, @@ -261,7 +261,7 @@ def test_process_time_from_event_spec(): """ ) - assert_frame_equal(result, expected) + assert_frame_equal(result.df, expected) def test_process_temporal_spec_multiple_values(): @@ -288,7 +288,7 @@ def test_process_temporal_spec_multiple_values(): """prediction_time_uuid,pred_value_1_within_0_to_1_days_mean_fallback_0,pred_value_2_within_0_to_1_days_mean_fallback_0 1-2021-01-01 00:00:00.000000,1,2""" ) - assert_frame_equal(result, expected) + assert_frame_equal(result.df, expected) def test_sliding_window(): @@ -339,4 +339,4 @@ def test_sliding_window(): 1-2022-01-01 00:00:00.000000,0.0,11.5""" ) - assert_frame_equal(result, expected) + assert_frame_equal(result.df, expected) diff --git a/src/timeseriesflattener/specs/timedelta.py b/src/timeseriesflattener/specs/timedelta.py index 9d7b74af..21459f54 100644 --- a/src/timeseriesflattener/specs/timedelta.py +++ b/src/timeseriesflattener/specs/timedelta.py @@ -34,7 +34,7 @@ class TimeDeltaSpec: def __post_init__(self): validate_col_name_columns_exist(obj=self) max_values_per_id = ( - self.init_frame.get_column(self.init_frame.entity_id_col_name).unique_counts().max() + self.init_frame.df.get_column(self.init_frame.entity_id_col_name).unique_counts().max() ) if max_values_per_id > 1: # type: ignore raise ValueError( From eab89610b15a3ecc25780834fb46ec90318a94db Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Jun 2024 13:10:17 +0200 Subject: [PATCH 3/3] docs: update README.md --- README.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 19d445a6..136aa717 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ result = Flattener( init_df=prediction_times_df, entity_id_col_name="id", timestamp_col_name="date" ) ).aggregate_timeseries(specs=[predictor_spec, outcome_spec]) -result +result.df ```