From b5a385ea92883cb6c9a675517b322703a7cabfb4 Mon Sep 17 00:00:00 2001
From: Fedor Baart <fedor.baart@deltares.nl>
Date: Thu, 25 Apr 2024 16:08:28 +0200
Subject: [PATCH 1/2] add ivs merge scripts

---
 notebooks/ivs/merge-ivs.ipynb | 201 ++++++++++++++++++++++++++++++++++
 1 file changed, 201 insertions(+)
 create mode 100644 notebooks/ivs/merge-ivs.ipynb

diff --git a/notebooks/ivs/merge-ivs.ipynb b/notebooks/ivs/merge-ivs.ipynb
new file mode 100644
index 0000000..2a62a18
--- /dev/null
+++ b/notebooks/ivs/merge-ivs.ipynb
@@ -0,0 +1,201 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "f93b6e98-e0cb-4303-9ede-7e159ee55f31",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use pathlib to scan for files\n",
+    "import pathlib\n",
+    "\n",
+    "# use polars to read csv files faster\n",
+    "import polars as pl \n",
+    "# progress bar\n",
+    "import tqdm.auto as tqdm\n",
+    "# geopandas for coordinates\n",
+    "import geopandas as gpd\n",
+    "# dask for reading multiple files in parallel\n",
+    "import dask.dataframe as dd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "11469369-f679-4136-add4-90555960a07f",
+   "metadata": {},
+   "source": [
+    "# Combine all IVS data files into one dataset\n",
+    "This notebook reads the IVS data from the [goederenvervoer](https://downloads.rijkswaterstaatdata.nl/scheepvaart/goederenvervoer/archief/) archive and stores it as one file for easier processing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "8e6c545f-adb4-43f2-821e-1ae06dd526ab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Download / update the files with the following command\n",
+    "# wget -c -nd -r -np -l 1 -A zip 'https://downloads.rijkswaterstaatdata.nl/scheepvaart/goederenvervoer/archief/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "1a1f1c60-fcd2-45bf-afc7-3acb25f270d5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# column name -> polars dtype, in file column order; used as the schema when reading the csv files\n",
+    "schema = {\n",
+    "    'Jaarmaand': pl.Int64,\n",
+    "    'Jaar': pl.Int64,\n",
+    "    'Maand': pl.Int64,\n",
+    "    'Weeknr': pl.Int64,\n",
+    "    'v05_06_begindt_evenement_iso': pl.String,\n",
+    "    'v05_06_Begindt_evenement': pl.String,\n",
+    "    'UNLO_herkomst': pl.String,\n",
+    "    'UNLO_bestemming': pl.String,\n",
+    "    'v15_1_Scheepstype_RWS': pl.String,\n",
+    "    'SK_CODE': pl.String,\n",
+    "    'v18_Laadvermogen': pl.Float64,\n",
+    "    'v28_Beladingscode': pl.Int64,\n",
+    "    'v38_Vervoerd_gewicht': pl.Int64,  # TODO: check units (kg?)\n",
+    "    'v30_4_Containers_TEU_S': pl.Int64,\n",
+    "    'nstr_nw': pl.String,  # categories: see https://www.cbs.nl/en-gb/our-services/methods/definitions/commodity-nomenclature-nstr\n",
+    "    'nst2007_nw': pl.String,\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "7c08e450-ce3d-401f-bc5a-1119940ceaea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pl.read_csv('./IVS_weekmonitor_01APR2021.csv', separator=';', quote_char='\"', schema=schema, ignore_errors=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "563f0680-5e5c-4155-8681-989854597ed5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_dir = pathlib.Path('.')\n",
+    "paths = list(data_dir.glob('*.csv'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "956efc78-1da6-4c1f-8898-568cdd4eaffa",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "65ffe5ef5deb4be29b15d09e383431a4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/1079 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "for path in tqdm.tqdm(paths):\n",
+    "    df = pl.read_csv(path, separator=';', quote_char='\"', schema=schema, ignore_errors=True)\n",
+    "    df.write_parquet(path.with_suffix('.parquet'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "043220af-12aa-4dde-90a4-a815d93f04aa",
+   "metadata": {},
+   "source": [
+    "Now we can re-read all the individual files. They should now all have the same static column types. We'll combine them into one dataset using dask, which can read multiple files and treat them as one."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "81a70d38-8411-4260-922e-4a2061efe566",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ddf = dd.read_parquet([str(path.with_suffix('.parquet')) for path in paths])  # only the per-csv parquet files; '*.parquet' would also pick up the ivs-2024*.parquet outputs when this notebook is re-run"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "76a4f3f3-18d2-42be-8b89-dd6666ba2f66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# drop all duplicate records (this might need some time: rows are compared across all files)\n",
+    "ddf = ddf.drop_duplicates()\n",
+    "ddf.to_parquet('ivs-2024.parquet', overwrite=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "ae885b79-fbb2-478e-9b44-ade2f23ec69b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# convert to one file (to read into memory)\n",
+    "ddf.compute().to_parquet('ivs-2024-one-file.parquet')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "191a15fb-eac4-4857-88be-04c4cd1ebc02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test read performance\n",
+    "ivs_df = pl.read_parquet('ivs-2024-one-file.parquet')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77fdb721-6bc9-4d42-84a2-1881893948cf",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From 48ce0324bbfa805f01ebc97a0c0104d25704a17a Mon Sep 17 00:00:00 2001
From: Fedor Baart <fedor.baart@deltares.nl>
Date: Mon, 29 Apr 2024 16:42:00 +0200
Subject: [PATCH 2/2] validation of FIS details on classification

---
 .../validation/cemt-network-validation.ipynb  | 203 ++++++++++++++++++
 1 file changed, 203 insertions(+)
 create mode 100644 notebooks/fis-network/validation/cemt-network-validation.ipynb

diff --git a/notebooks/fis-network/validation/cemt-network-validation.ipynb b/notebooks/fis-network/validation/cemt-network-validation.ipynb
new file mode 100644
index 0000000..defd72f
--- /dev/null
+++ b/notebooks/fis-network/validation/cemt-network-validation.ipynb
@@ -0,0 +1,203 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ba4fce13-543f-467d-8c64-79b7fe1d5a46",
+   "metadata": {},
+   "source": [
+    "# Check if CEMT codes are correct\n",
+    "This notebook describes how well the Vaarweg codes are filled in. They are consistently filled in, but only in NL and part of BE. Germany is missing. We also show how to compute a consistent ordinal variable.\n",
+    "\n",
+    "This information is based on Vaarweginformatie: https://www.vaarweginformatie.nl/frp/main/#/home\n",
+    "\n",
+    "You can get extra info from the EURIS API:\n",
+    "See for example https://developer.eurisportal.eu/docs/reference-data/locks\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "507a5c44-a43b-4bf3-a0e2-730f6b4c5f8a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "\n",
+    "import networkx as nx\n",
+    "import requests\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "d9a6a36d-a55b-41ee-9e22-95c3293afedf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fis_url = 'https://zenodo.org/records/6673604/files/network_digital_twin_v0.3.pickle?download=1'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "38a4e34c-760f-47b1-8217-fa6757d91be2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "resp = requests.get(fis_url, timeout=60)  # without a timeout a stalled connection would block the notebook indefinitely"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "d28b2422-d698-4af5-96da-c6ec8dfd28ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "graph = pickle.loads(resp.content)  # NOTE(security): unpickling can execute arbitrary code; only do this for a download source you trust"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "15ce72a0-25ab-486e-8560-24a27318b6ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "edges_df = nx.to_pandas_edgelist(graph)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "426d5170-2746-4b6b-a42e-0d31ac0fcb06",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# store nan filled codes under CEMT\n",
+    "\n",
+    "\n",
+    "def code2cemt(code):\n",
+    "    \"\"\"Translate a vaarweginformatie code into its CEMT class label.\n",
+    "\n",
+    "    Missing values (as detected by pd.isna) give an empty string, the\n",
+    "    code '_0' gives '0', and any other code is passed through unchanged.\n",
+    "    \"\"\"\n",
+    "    if pd.isna(code):\n",
+    "        # nothing filled in for this record\n",
+    "        return ''\n",
+    "    if code == '_0':\n",
+    "        # class 0 is stored as '_0'; it is a special class defined in richtlijn vaarwegen 2011\n",
+    "        return '0'\n",
+    "    # all other codes are kept exactly as they are stored\n",
+    "    return code\n",
+    "    \n",
+    "# TODO: decide whether to add the empty category ''; it is currently not listed, so missing codes become NaN in the categorical\n",
+    "# all categories from the richtlijn vaarwegen, plus VIIa (not used in NL)\n",
+    "categories = ['0', 'I', 'II', 'III', 'IV', 'IVa', 'IVb', 'V_A', 'V_B', 'VI_A', 'VI_B', 'VI_C', 'VII', 'VIIa']\n",
+    "cemt_dtype = pd.CategoricalDtype(categories=categories, ordered=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "8599c3f0-5234-408e-90a9-6945a71e71d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# you can use the above function to create an elegant categorical column, including ordering\n",
+    "# Note that when using the data in the graph the ordering does not work.\n",
+    "# in a future update we should make the codes uniform and rename them from Code to CEMT\n",
+    "edges_df['CEMT'] = pd.Categorical(edges_df['Code'].apply(code2cemt), dtype=cemt_dtype)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "3af3af32-f284-4d38-b069-745ca81fa619",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Map each CEMT class to one RWS class: with multiple M classes we select the middle or upper M class; when no M class exists, pick the RWS class from barges.\n",
+    "cemt2rws_dict = {\n",
+    "    \"0\": \"M0\",\n",
+    "    \"I\": \"M1\",\n",
+    "    \"II\": \"M2\",\n",
+    "    \"III\": \"M4\",\n",
+    "    \"IV\": \"M7\",\n",
+    "    \"V_A\": \"M9\",  # was \"M\", which is not an RWS class (absent from rws_dtype, so it became NaN); M9 assumes V_A covers M8/M9 and takes the upper one -- TODO confirm\n",
+    "    \"V_B\": \"BII-2l\",\n",
+    "    \"VI_A\": \"M11\",\n",
+    "    \"VI_B\": \"BII-4\",\n",
+    "    \"VI_C\": \"BII-6l\",\n",
+    "}\n",
+    "def cemt2rws(cemt):\n",
+    "    \"\"\"Return the RWS class for a CEMT label, or '' when the label is not in cemt2rws_dict.\"\"\"\n",
+    "    return cemt2rws_dict.get(cemt, '')\n",
+    "\n",
+    "\n",
+    "categories = ['M0', 'M1', 'BO1', 'M2', 'BO2', 'M3', 'BO3', 'M4', 'BO4', 'M5', 'M6', 'BI', 'M7', 'M8', 'BII-1', 'M9', 'BIIa-1', 'BIIL-1', 'BII-2l', 'M10', 'BII-2b', 'M11', 'M12',  'BII-4', 'BII-6l', 'BII-6b'] \n",
+    "rws_dtype = pd.CategoricalDtype(categories=categories, ordered=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "00a0129b-02b2-483c-8381-0fac9e89906a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "edges_df['rws_richtlijn_vaarwegen'] = pd.Categorical(edges_df['CEMT'].apply(cemt2rws), dtype=rws_dtype)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "5da3ed64-351a-4deb-9c9d-eeb5b6d1d259",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# update the graph with our new classification info\n",
+    "for _, info in graph.edges.items():\n",
+    "    # .get() so an edge without a 'Code' attribute is treated as missing ('') instead of raising KeyError\n",
+    "    cemt = code2cemt(info.get('Code'))\n",
+    "    info['CEMT'] = cemt\n",
+    "    info['rws_richtlijn_vaarwegen'] = cemt2rws(cemt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "88d8074e-541d-4a6b-9bb3-53bffc0f23bd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}