From 6c1d6cad21031a65171ac9f8339a444de294d42b Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Feb 2024 13:35:57 +0100 Subject: [PATCH] docs(#475): delete 02_advanced.ipynb Fixes #475 --- docs/tutorials/02_advanced.ipynb | 549 ------------------------------- 1 file changed, 549 deletions(-) delete mode 100644 docs/tutorials/02_advanced.ipynb diff --git a/docs/tutorials/02_advanced.ipynb b/docs/tutorials/02_advanced.ipynb deleted file mode 100644 index 7ab81576..00000000 --- a/docs/tutorials/02_advanced.ipynb +++ /dev/null @@ -1,549 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Advanced Tutorial" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the basic tutorial we covered how to add static features, predictors and outcomes.\n", - "In this tutorial, we'll expand on that, covering how to effectively add many features by:\n", - "1. Creating feature combinations from specifications,\n", - "2. Using caching, so you can iterate on your datasets without having to redo the full computation every time\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating feature combinations\n", - "Manually specifying a handful of features one at a time is rather straightforward, but what if you want to generate hundreds of features? Or want to have multiple different lookbehind windows, e.g. a month, 6 months and a year? Then the amount of code you'll have to write grows substantially and becomes time-consuming and hard to navigate.\n", - "\n", - "To solve this problem, we implemented feature group specifications. They allow you to combinatorially create features. Let's look at an example:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from __future__ import annotations\n", - "\n", - "from pprint import pprint\n", - "\n", - "import numpy as np\n", - "from timeseriesflattener.aggregation_fns import maximum, mean\n", - "from timeseriesflattener.feature_specs.group_specs import NamedDataframe, PredictorGroupSpec\n", - "from timeseriesflattener.testing.load_synth_data import load_synth_predictor_float" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "pred_spec_batch = PredictorGroupSpec(\n", - "    named_dataframes=[\n", - "        NamedDataframe(df=load_synth_predictor_float(), name=\"synth_predictor_float\")\n", - "    ],\n", - "    lookbehind_days=[(0, 365), (365, 730), 1095],\n", - "    fallback=[np.nan],\n", - "    aggregation_fns=[mean, maximum],\n", - ").create_combinations()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You'll note that:\n", - "\n", - "1. All attributes are now required to be lists. This makes iteration easier when creating the combinations.\n", - "2. We require a named_dataframes sequence. A NamedDataframe is exactly that: a dataframe and a name. The name is used when we create the features in the output, e.g. for a predictor, the output feature using load_synth_predictor_float will be called pred_synth_predictor_float_ because that's the name attribute of the NamedDataframe.\n", - "\n", - "Let's check that the results look good."
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "––––––––– We created 6 combinations of predictors. ––––––––––\n", - "[{'aggregation_fn': 'mean',\n", - " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': LookPeriod(min_days=0.0, max_days=365.0)},\n", - " {'aggregation_fn': 'maximum',\n", - " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': LookPeriod(min_days=0.0, max_days=365.0)},\n", - " {'aggregation_fn': 'mean',\n", - " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': LookPeriod(min_days=365.0, max_days=730.0)},\n", - " {'aggregation_fn': 'maximum',\n", - " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': LookPeriod(min_days=365.0, max_days=730.0)},\n", - " {'aggregation_fn': 'mean',\n", - " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': LookPeriod(min_days=0, max_days=1095.0)},\n", - " {'aggregation_fn': 'maximum',\n", - " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': LookPeriod(min_days=0, max_days=1095.0)}]\n" - ] - } - ], - "source": [ - "# Create a small summary to highlight the generated predictors\n", - "pred_spec_batch_summary = [\n", - " {\n", - " \"feature_name\": pred_spec.feature_base_name,\n", - " \"lookbehind_days\": pred_spec.lookbehind_period,\n", - " \"aggregation_fn\": pred_spec.aggregation_fn.__name__,\n", - " }\n", - " for pred_spec in pred_spec_batch\n", - "]\n", - "print(f\"––––––––– We created {len(pred_spec_batch)} combinations of predictors. ––––––––––\")\n", - "pprint(pred_spec_batch_summary)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we know how to create a bunch of feature specifications quickly! But with more features comes more computation. Let's look at caching next, so we can iterate on our datasets more quickly." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Caching" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Timeseriesflattener ships with a class that allows for caching to disk. 
Let's look at an example of that:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "\n", - "from skimpy import skim\n", - "from timeseriesflattener.feature_cache.cache_to_disk import DiskCache\n", - "from timeseriesflattener.flattened_dataset import TimeseriesFlattener\n", - "from timeseriesflattener.testing.load_synth_data import load_synth_prediction_times" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-01-18 11:38:02 [INFO] Overriding pred_time_uuid_col_name in cache with pred_time_uuid_col_name passed to init of flattened dataset\n" - ] - } - ], - "source": [ - "ts_flattener = TimeseriesFlattener(\n", - "    prediction_times_df=load_synth_prediction_times(),\n", - "    entity_id_col_name=\"entity_id\",\n", - "    timestamp_col_name=\"timestamp\",\n", - "    n_workers=4,\n", - "    cache=DiskCache(feature_cache_dir=Path(\".tmp\") / \"feature_cache\"),\n", - "    drop_pred_times_with_insufficient_look_distance=True,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All we need to specify is that we use the DiskCache class, and which directory to save the feature cache to.\n", - "\n", - "The first time we create features, this will just save them to disk and won't make any difference to performance. But say we want to add two more features - then it'll load the already-computed features from disk and only compute the two new ones.\n", - "\n", - "Note that DiskCache is an instance of the abstract class FeatureCache. If you want to implement your own cache, for example using Redis or SQL, all you need to do is implement the 3 methods in that class, as sketched below." - ] - },
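- { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As a minimal sketch of what such a custom cache could look like, the cell below uses an in-memory dict as the backend. Note that the three method names are hypothetical placeholders for illustration; check the FeatureCache source for the actual abstract methods a real implementation must override." - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [
- "# Sketch of a custom feature cache backed by an in-memory dict.\n",
- "# NOTE: the three method names below are hypothetical placeholders; a real\n",
- "# implementation must subclass FeatureCache and override its actual abstract methods.\n",
- "import pandas as pd\n",
- "\n",
- "\n",
- "class InMemoryCache:  # a real cache would subclass FeatureCache\n",
- "    def __init__(self) -> None:\n",
- "        self._store: dict[str, pd.DataFrame] = {}\n",
- "\n",
- "    def feature_exists(self, feature_name: str) -> bool:\n",
- "        # Check whether a feature has already been computed and cached.\n",
- "        return feature_name in self._store\n",
- "\n",
- "    def read_feature(self, feature_name: str) -> pd.DataFrame:\n",
- "        # Load a previously computed feature from the cache.\n",
- "        return self._store[feature_name]\n",
- "\n",
- "    def write_feature(self, feature_name: str, df: pd.DataFrame) -> None:\n",
- "        # Persist a newly computed feature so later runs can skip recomputation.\n",
- "        self._store[feature_name] = df" - ] - },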
- { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, let's compute a dataframe to check that everything works." - ] - },
- { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "ts_flattener.add_spec(pred_spec_batch)" - ] - },
- { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [
- "2024-01-18 11:38:03 [INFO] There were unprocessed specs, computing...\n",
- "2024-01-18 11:38:03 [INFO] _drop_pred_time_if_insufficient_look_distance: Dropped 6053 (60.53%) rows\n",
- "2024-01-18 11:38:03 [INFO] Processing 6 temporal features in parallel with 4 workers. Chunksize is 2. If this is above 1, it may take some time for the progress bar to move, as processing is batched. However, this makes for much faster total performance.\n",
- "  0%|          | 0/6 [00:00<?, ?it/s]"
- ] - } - ], - "source": [ - "df = ts_flattener.get_df()" - ] - },
- { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [
- "skimpy summary\n",
- "\n",
- "Data Summary: 3947 rows, 9 columns\n",
- "Data Types: float64 (6), int64 (1), datetime64 (1), string (1)\n",
- "\n",
- "number\n",
- "column_name              NA    NA %   mean   sd     p0       p25    p75    p100    hist\n",
- "entity_id                 0    0      5000   2900   0        2600   7400   10000   █████▇\n",
- "pred_synth_predictor…     7    0.18   5      1.3    0.29     4.1    5.8    9.9      ▂█▇▁\n",
- "pred_synth_predictor…   510    13     6.6    2.6    0.024    4.8    8.8    10      ▂▂▃▄▆█\n",
- "pred_synth_predictor…   530    14     6.6    2.6    0.0084   4.8    8.8    10      ▁▂▃▄▆█\n",
- "pred_synth_predictor…     7    0.18   8.4    1.5    0.29     7.8    9.5    10         ▁▃█\n",
- "pred_synth_predictor…   510    13     5.1    2.2    0.024    3.6    6.5    10      ▂▄██▅▂\n",
- "pred_synth_predictor…   530    14     5      2.1    0.0084   3.6    6.4    9.9     ▂▄██▄▂\n",
- "\n",
- "datetime\n",
- "column_name   NA   NA %   first                 last                  frequency\n",
- "timestamp      0   0      1968-01-02 05:12:00   1969-12-31 21:42:00   None\n",
- "\n",
- "string\n",
- "column_name             NA   NA %   words per row   total words\n",
- "prediction_time_uuid     0   0      1               3900\n"
- ] - }, - "metadata": {}, - "output_type": "display_data" - },
- { - "data": { - "text/plain": [
- "['entity_id',\n",
- " 'timestamp',\n",
- " 'prediction_time_uuid',\n",
- " 'pred_synth_predictor_float_within_0_to_1095_days_mean_fallback_nan',\n",
- " 'pred_synth_predictor_float_within_365_to_730_days_maximum_fallback_nan',\n",
- " 'pred_synth_predictor_float_within_0_to_365_days_maximum_fallback_nan',\n",
- " 'pred_synth_predictor_float_within_0_to_1095_days_maximum_fallback_nan',\n",
- " 'pred_synth_predictor_float_within_365_to_730_days_mean_fallback_nan',\n",
- " 'pred_synth_predictor_float_within_0_to_365_days_mean_fallback_nan']"
- ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "skim(df)\n", - "\n", - "list(df.columns)" - ] - },
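- { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we passed a DiskCache to the flattener, the six computed predictors now live under .tmp/feature_cache. As a rough sketch of the effect (exact timings will vary with hardware and data size), building the same dataset again should now be dominated by cache reads rather than feature computation:" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [
- "# Sketch: time a second computation of the same dataset. With the DiskCache in\n",
- "# place, already-computed features are read from disk instead of being recomputed.\n",
- "import time\n",
- "\n",
- "start = time.time()\n",
- "df_cached = ts_flattener.get_df()\n",
- "print(f\"Built dataset with {df_cached.shape[0]} rows in {time.time() - start:.2f} s\")" - ] - },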
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 entity_idtimestampprediction_time_uuidpred_1pred_2pred_3pred_4pred_5pred_6
099031968-05-09 21:24:009903-1968-05-09-21-24-002.8646262.1943190.1549815.9315531.4086550.154981
149271968-06-30 12:13:004927-1968-06-30-12-13-004.466599nan6.7306948.630901nan4.957251
231571969-10-07 05:01:003157-1969-10-07-05-01-004.168456nan5.2431765.243176nan5.068323
397931968-12-15 12:59:009793-1968-12-15-12-59-007.1449598.2932669.7089769.7271826.2304178.091755
498611969-01-22 17:34:009861-1969-01-22-17-34-003.6696355.4914153.1302836.2171613.3091973.130283
56571969-04-14 15:47:00657-1969-04-14-15-47-007.3915147.903614nan7.9036147.903614nan
679161968-12-20 03:38:007916-1968-12-20-03-38-004.2517046.0845234.3185866.9791566.0845233.901992
728831968-01-28 21:50:002883-1968-01-28-21-50-004.712403nan8.2577428.257742nan8.257742
815151968-07-18 08:28:001515-1968-07-18-08-28-003.1127003.6846148.6548398.6548393.1046742.907289
967541968-09-21 01:27:006754-1968-09-21-01-27-005.0829183.1021322.3466449.6577552.3249132.346644
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# For displayability, shorten col names\n", - "pred_cols = [c for c in df.columns if c.startswith(\"pred_\")]\n", - "rename_dict = {c: f\"pred_{i+1}\" for i, c in enumerate(pred_cols)}\n", - "df_renamed = df.rename(rename_dict, axis=1)\n", - "\n", - "# Print a dataframe\n", - "base_cols = [\"entity_id\", \"timestamp\", \"prediction_time_uuid\"]\n", - "renamed_cols = list(rename_dict.values())\n", - "\n", - "df_renamed[0:10][base_cols + renamed_cols].style.set_table_attributes('style=\"font-size: 14px\"')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.7 ('.venv': poetry)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "d2b49c0af2d95979144de75823f7cfbb268839811992fdd0cb17fc1bb54ce815" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}