From 520176f25df0d5af62cc370a23d7546f3d7cbc31 Mon Sep 17 00:00:00 2001 From: Hussein Mahfouz <45176416+Hussein-Mahfouz@users.noreply.github.com> Date: Thu, 15 Feb 2024 16:30:04 +0000 Subject: [PATCH 1/2] use spc python package to prepare synthetic pop --- notebooks/dummy_notebook.ipynb | 18 - notebooks/synthpop.ipynb | 347 +++++++ poetry.lock | 1679 +++++++++++++++++++++++++++++++- pyproject.toml | 1 + 4 files changed, 2021 insertions(+), 24 deletions(-) delete mode 100644 notebooks/dummy_notebook.ipynb create mode 100644 notebooks/synthpop.ipynb diff --git a/notebooks/dummy_notebook.ipynb b/notebooks/dummy_notebook.ipynb deleted file mode 100644 index 709d82c..0000000 --- a/notebooks/dummy_notebook.ipynb +++ /dev/null @@ -1,18 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/synthpop.ipynb b/notebooks/synthpop.ipynb new file mode 100644 index 0000000..9beaf00 --- /dev/null +++ b/notebooks/synthpop.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "#import polars as pl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use the spc package for our synthetic population. To add it as a dependency in this virtual environment, I ran `poetry add git+https://github.com/alan-turing-institute/uatk-spc.git@55-output-formats-python#subdirectory=python`. The branch may change if the python package is merged into the main spc branch. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "#https://github.com/alan-turing-institute/uatk-spc/blob/55-output-formats-python/python/examples/spc_builder_example.ipynb\n", + "from uatk_spc.builder import Builder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading in the SPC synthetic population\n", + "\n", + "I use the code in the `Quickstart` [here](https://github.com/alan-turing-institute/uatk-spc/blob/55-output-formats-python/python/README.md) to get a parquet file and convert it to JSON. \n", + "\n", + "You have two options:\n", + "\n", + "\n", + "1- Slow and memory-hungry: Download the pbf file directly from [here](https://alan-turing-institute.github.io/uatk-spc/using_england_outputs.html) and load in the pbf file with the python package\n", + "\n", + "2- Faster: Convert the pbf file to parquet, and then load it using the python package. To convert to parquet, you need to:\n", + "\n", + "a. Clone the [uatk-spc](https://github.com/alan-turing-institute/uatk-spc/tree/main/docs) repository \n", + " \n", + "b. Run `cargo run --release -- --rng-seed 0 --flat-output config/England/west-yorkshire.txt --year 2020` and replace `west-yorkshire` and `2020` with your preferred option\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Pick a region with SPC output saved\n", + "path = \"../data/spc_output/raw/\"\n", + "region = \"west-yorkshire\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### People and household data" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
id | household | workplace | location | orig_pid | id_tus_hh | id_tus_p | pid_hs | demographics | sic1d2007 | sic2d2007 | soc2010 | pwkstat | salary_yearly | salary_hourly | bmi | has_cardiovascular_disease | has_diabetes | has_high_blood_pressure | number_medications | self_assessed_health | life_satisfaction | events | weekday_diaries | weekend_diaries | msoa | oa | members | hid | nssec8 | accommodation_type | communal_type | num_rooms | central_heat | tenure | num_cars |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
u64 | u64 | u64 | struct[2] | str | i64 | i64 | i64 | struct[4] | str | u64 | u64 | i32 | f32 | f32 | f32 | bool | bool | bool | u64 | i32 | i32 | struct[7] | list[u64] | list[u64] | str | str | list[u64] | str | i32 | i32 | i32 | u64 | bool | i32 | u64 |
0 | 0 | null | {-1.789218,53.919151} | "E02002183_0001… | 11291218 | 1 | 2905399 | {1,86,1,1} | "J" | 58 | 1115 | 6 | null | null | 24.879356 | false | false | false | null | 3 | 2 | {0.09,0.1134,2.9846e-31,1.2791e-31,0.000881,0.000377,0.10494} | [1583, 13161] | [1582, 13160] | "E02002183" | "E00053954" | [0] | "E02002183_0001… | 1 | 1 | null | 2 | true | 2 | 2 |
1 | 1 | null | {-1.826238,53.92028} | "E02002183_0002… | 17291219 | 1 | 2905308 | {1,74,3,1} | "C" | 25 | 1121 | 6 | null | null | 27.491207 | false | false | true | null | 3 | null | {0.239,0.30114,2.2734e-20,9.7432e-21,0.051032,0.021871,0.13662} | [2900, 4948, … 15793] | [2901, 4949, … 15792] | "E02002183" | "E00053953" | [1, 2] | "E02002183_0002… | 1 | 3 | null | 6 | true | 2 | 2 |
2 | 1 | null | {-1.826238,53.92028} | "E02002183_0002… | 17070713 | 2 | 2907681 | {2,68,1,2} | "P" | 85 | 2311 | 6 | null | null | 17.310829 | false | true | true | null | 2 | 4 | {0.239,0.17686,3.6288e-16,8.4672e-16,0.098134,0.228979,0.15741} | [3010, 6389, … 11598] | [3011, 6388, … 11599] | "E02002183" | "E00053953" | [1, 2] | "E02002183_0002… | 1 | 3 | null | 6 | true | 2 | 2 |
3 | 2 | 56126 | {-1.874994,53.942989} | "E02002183_0003… | 20310313 | 1 | 2902817 | {1,27,1,4} | "C" | 31 | 3422 | 1 | 32857.859375 | 14.360952 | 20.852091 | false | false | false | null | 2 | 1 | {0.233,0.14679,4.397019,1.884437,0.522664,0.223999,0.15741} | [366, 867, … 14534] | [365, 868, … 14533] | "E02002183" | "E00053689" | [3, 4] | "E02002183_0003… | 4 | 3 | null | 6 | true | 2 | 1 |
4 | 2 | null | {-1.874994,53.942989} | "E02002183_0003… | 13010909 | 3 | 2900884 | {2,26,1,6} | "J" | 62 | 7214 | 1 | 18162.451172 | 9.439944 | 20.032526 | false | false | false | 1 | 2 | 3 | {0.233,0.08621,2.090329,4.877435,0.18608,0.434187,0.15741} | [1289, 12528, 12870] | [1288, 12529, 12871] | "E02002183" | "E00053689" | [3, 4] | "E02002183_0003… | 4 | 3 | null | 6 | true | 2 | 1 |
health |
---|
struct[7] |
{24.879356,false,false,false,null,3,2} |
{27.491207,false,false,true,null,3,null} |
{17.310829,false,true,true,null,2,4} |
{20.852091,false,false,false,null,2,1} |
{20.032526,false,false,false,1,2,3} |
{29.106817,false,false,true,null,1,3} |
{25.621599,false,false,false,3,3,3} |
{33.893459,true,false,true,3,1,3} |
{null,false,false,false,null,1,null} |
{24.492905,false,false,false,null,4,2} |
{31.561234,true,false,true,4,2,4} |
{28.171663,false,true,true,null,3,3} |
… |
{22.046501,false,false,false,2,1,3} |
{14.627893,false,false,false,1,1,1} |
{25.986469,false,false,false,0,1,null} |
{23.44569,false,false,false,1,3,1} |
{26.506229,false,false,true,null,3,3} |
{25.481789,false,false,false,null,3,2} |
{14.997225,false,false,false,2,3,2} |
{22.199043,false,false,false,0,2,2} |
{23.534786,false,false,false,null,3,2} |
{18.523956,true,false,true,7,4,4} |
{28.988529,false,false,false,null,1,3} |
{18.38345,false,false,false,1,1,3} |
health |
---|
struct[7] |
{24.879356,false,false,false,null,3,2} |
{27.491207,false,false,true,null,3,null} |
{17.310829,false,true,true,null,2,4} |
{20.852091,false,false,false,null,2,1} |
{20.032526,false,false,false,1,2,3} |
{29.106817,false,false,true,null,1,3} |
{25.621599,false,false,false,3,3,3} |
{33.893459,true,false,true,3,1,3} |
{null,false,false,false,null,1,null} |
{24.492905,false,false,false,null,4,2} |
{31.561234,true,false,true,4,2,4} |
{28.171663,false,true,true,null,3,3} |
… |
{22.046501,false,false,false,2,1,3} |
{14.627893,false,false,false,1,1,1} |
{25.986469,false,false,false,0,1,null} |
{23.44569,false,false,false,1,3,1} |
{26.506229,false,false,true,null,3,3} |
{25.481789,false,false,false,null,3,2} |
{14.997225,false,false,false,2,3,2} |
{22.199043,false,false,false,0,2,2} |
{23.534786,false,false,false,null,3,2} |
{18.523956,true,false,true,7,4,4} |
{28.988529,false,false,false,null,1,3} |
{18.38345,false,false,false,1,1,3} |