diff --git a/cmf/data/results.py b/cmf/data/results.py index dad2e7c..3f1a25c 100644 --- a/cmf/data/results.py +++ b/cmf/data/results.py @@ -203,8 +203,11 @@ def inspect_with_source( """Enriches the results with the source data.""" df = ( self.to_df() - .filter(["left_id", "right_id"]) - .map(str) + .filter(["left_id", "right_id", "probability"]) + .assign( + left_id=lambda d: d.left_id.apply(str), + right_id=lambda d: d.right_id.apply(str), + ) .merge( left_data.assign(**{left_key: lambda d: d[left_key].apply(str)}), how="left", diff --git a/notebooks/engineering/WL_query.ipynb b/notebooks/engineering/WL_query.ipynb index 0b1d7bb..5fb2995 100644 --- a/notebooks/engineering/WL_query.ipynb +++ b/notebooks/engineering/WL_query.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 12, "id": "6c751528-6238-4f29-a9e4-79bf167d8308", "metadata": {}, "outputs": [], @@ -29,10 +29,12 @@ "# import connectorx as cx\n", "from pandas import DataFrame\n", "import pandas as pd\n", - "from typing import Optional\n", + "from typing import Optional, Dict, List\n", "from sqlglot import parse_one\n", + "import time\n", + "from datetime import timedelta\n", "\n", - "from sqlalchemy import select\n", + "from sqlalchemy import select, Engine\n", "from sqlalchemy.dialects import postgresql\n", "\n", "import cmf\n", @@ -69,11 +71,263 @@ "source": [ "# Speeding up queries\n", "\n", - "Everything is slower than I thought. Let's profile it.\n", + "Everything is slower than I thought. Let's profile and optimise.\n", + "\n", + "Let's compile the SQL for three tables so we've got points to compare and contrast." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# This is just the relevant innards of query(), returning the statement as pretty-printed PostgreSQL text with parameters bound\n", + "\n", + "def compile_query_to_postgresql(\n", + " selector: Dict[str, List[str]],\n", + " model: str,\n", + " engine: Engine = ENGINE,\n", + ") -> str:\n", + " parent, child = _parent_to_tree(model, engine=engine)\n", + "\n", + " if len(parent) == 0:\n", + " raise ValueError(f\"Model {model} not found\")\n", + "\n", + " tree = [parent] + child\n", + " reachable_stmt = _tree_to_reachable_stmt(tree)\n", + " lookup_stmt = _reachable_to_parent_data_stmt(reachable_stmt, parent)\n", + " data_stmt = _selector_to_data(selector, engine=engine).cte()\n", + "\n", + " final_stmt = select(lookup_stmt.c.parent.label(\"cluster_sha1\"), data_stmt).join(\n", + " lookup_stmt, lookup_stmt.c.child == data_stmt.c.data_sha1\n", + " )\n", + "\n", + " with engine.connect() as conn:  # use the engine argument, not the module-level ENGINE\n", + " cursor = conn.connection.cursor()\n", + " compiled = final_stmt.compile(\n", + " dialect=postgresql.dialect(),\n", + " compile_kwargs={\"render_postcompile\": True}\n", + " )\n", + " compiled_bound = cursor.mogrify(str(compiled), compiled.params)\n", + " sql = parse_one(compiled_bound.decode(\"utf-8\"))\n", + "\n", + " return sql.sql(dialect=\"postgres\", pretty=True)" + ] + }, + { + "source": [ + "## 🔴 Data Hub companies\n", + "\n", + "\n", + "Weirdly slow for 500k records. Times out.\n", + "\n", + "...and now doesn't?! This was proper breaking last week!" 
+ ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "WITH RECURSIVE source_data_unnested AS (\n SELECT\n _team_cmf.cmf__source_data.sha1 AS sha1,\n UNNEST(_team_cmf.cmf__source_data.id) AS id,\n _team_cmf.cmf__source_data.dataset AS dataset\n FROM _team_cmf.cmf__source_data\n), anon_1 AS (\n SELECT\n source_data_unnested.sha1 AS data_sha1,\n dbt.data_hub__companies.name AS dbt_data_hub__companies_name,\n dbt.data_hub__companies.company_number AS dbt_data_hub__companies_company_number,\n dbt.data_hub__companies.address_postcode AS dbt_data_hub__companies_address_postcode\n FROM source_data_unnested\n LEFT OUTER JOIN dbt.data_hub__companies\n ON source_data_unnested.id = CAST(dbt.data_hub__companies.id AS VARCHAR)\n AND source_data_unnested.dataset = CAST(CAST('60f65644-8990-4fcc-b0c3-555cbd284b7d' AS UUID) AS UUID)\n WHERE\n NOT dbt.data_hub__companies.id IS NULL\n), allowed AS (\n SELECT\n _team_cmf.cmf__ddupe_contains.parent AS parent,\n _team_cmf.cmf__ddupe_contains.child AS child\n FROM _team_cmf.cmf__ddupe_contains\n JOIN _team_cmf.cmf__clusters AS cmf__clusters_1\n ON _team_cmf.cmf__ddupe_contains.parent = cmf__clusters_1.sha1\n JOIN _team_cmf.cmf__models_create_clusters\n ON _team_cmf.cmf__models_create_clusters.child = cmf__clusters_1.sha1\n JOIN _team_cmf.cmf__models\n ON _team_cmf.cmf__models_create_clusters.parent = _team_cmf.cmf__models.sha1\n WHERE\n _team_cmf.cmf__models.sha1 IN (CAST('\\xa1b6d0eaf9115726b371548db2f97ee99af64854' AS BYTEA))\n UNION\n SELECT\n _team_cmf.cmf__link_contains.parent AS parent,\n _team_cmf.cmf__link_contains.child AS child\n FROM _team_cmf.cmf__link_contains\n JOIN _team_cmf.cmf__clusters AS cmf__clusters_1\n ON _team_cmf.cmf__link_contains.parent = cmf__clusters_1.sha1\n JOIN _team_cmf.cmf__clusters AS cmf__clusters_2\n ON _team_cmf.cmf__link_contains.child = 
cmf__clusters_2.sha1\n JOIN _team_cmf.cmf__models_create_clusters\n ON _team_cmf.cmf__models_create_clusters.child = cmf__clusters_1.sha1\n JOIN _team_cmf.cmf__models\n ON _team_cmf.cmf__models_create_clusters.parent = _team_cmf.cmf__models.sha1\n WHERE\n _team_cmf.cmf__models.sha1 IN (CAST('\\xa1b6d0eaf9115726b371548db2f97ee99af64854' AS BYTEA))\n), root AS (\n SELECT\n allowed.parent AS parent,\n allowed.child AS child\n FROM allowed\n JOIN _team_cmf.cmf__clusters\n ON _team_cmf.cmf__clusters.sha1 = allowed.parent\n JOIN _team_cmf.cmf__models_create_clusters\n ON _team_cmf.cmf__models_create_clusters.child = _team_cmf.cmf__clusters.sha1\n JOIN _team_cmf.cmf__models\n ON _team_cmf.cmf__models_create_clusters.parent = _team_cmf.cmf__models.sha1\n WHERE\n _team_cmf.cmf__models.sha1 = CAST('\\xa1b6d0eaf9115726b371548db2f97ee99af64854' AS BYTEA)\n), recurse(parent, child) AS (\n SELECT\n root.parent AS parent,\n root.child AS child\n FROM root\n UNION\n SELECT\n recurse.parent AS parent,\n allowed.child AS child\n FROM allowed\n JOIN recurse\n ON allowed.parent = recurse.child\n)\nSELECT\n recurse.parent AS cluster_sha1,\n anon_1.data_sha1,\n anon_1.dbt_data_hub__companies_name,\n anon_1.dbt_data_hub__companies_company_number,\n anon_1.dbt_data_hub__companies_address_postcode\nFROM anon_1\nJOIN recurse\n ON recurse.child = anon_1.data_sha1\n" + } + ], + "source": [ + "my_selector = selector(\n", + " table=\"dbt.data_hub__companies\",\n", + " fields=[\"name\", \"company_number\", \"address_postcode\"],\n", + ")\n", + "my_model = \"naive_data_hub_v1\"\n", + "\n", + "compiled = compile_query_to_postgresql(selector=my_selector, model=my_model)\n", + "print(compiled)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Total time: 0:01:45.517325\n" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": " cluster_sha1 \\\n0 
b\"B\\xae\\\\4\\x83\\xe8\\xad#\\x91z'\\xa3\\x0e\\xbb#\\xb1... \n1 b'K\\xfa\\xb4\\xb9\\xac\\xbe\\x8e\\x8c\\xdd\\x12\\x0e\\xd... \n2 b'\\\\\\xe1WW\\x97\\x06$\\x9eV=12h\\xce\\x7f:\\xaa\\x9a\\... \n\n data_sha1 \\\n0 b\"B\\xae\\\\4\\x83\\xe8\\xad#\\x91z'\\xa3\\x0e\\xbb#\\xb1... \n1 b'K\\xfa\\xb4\\xb9\\xac\\xbe\\x8e\\x8c\\xdd\\x12\\x0e\\xd... \n2 b'\\\\\\xe1WW\\x97\\x06$\\x9eV=12h\\xce\\x7f:\\xaa\\x9a\\... \n\n dbt_data_hub__companies_name \\\n0 National Star Centre For Disabled Youth Ltd \n1 HAWKESBURY CONSULTING LIMITED \n2 BIRMINGHAM WOMENS AND CHILDRENS NHS FOUNDATION... \n\n dbt_data_hub__companies_company_number \\\n0 \n1 06736356 \n2 \n\n dbt_data_hub__companies_address_postcode \n0 GL53 9QU \n1 CB24 4UQ \n2 B4 6NH ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
cluster_sha1data_sha1dbt_data_hub__companies_namedbt_data_hub__companies_company_numberdbt_data_hub__companies_address_postcode
0b\"B\\xae\\\\4\\x83\\xe8\\xad#\\x91z'\\xa3\\x0e\\xbb#\\xb1...b\"B\\xae\\\\4\\x83\\xe8\\xad#\\x91z'\\xa3\\x0e\\xbb#\\xb1...National Star Centre For Disabled Youth Ltd<NA>GL53 9QU
1b'K\\xfa\\xb4\\xb9\\xac\\xbe\\x8e\\x8c\\xdd\\x12\\x0e\\xd...b'K\\xfa\\xb4\\xb9\\xac\\xbe\\x8e\\x8c\\xdd\\x12\\x0e\\xd...HAWKESBURY CONSULTING LIMITED06736356CB24 4UQ
2b'\\\\\\xe1WW\\x97\\x06$\\x9eV=12h\\xce\\x7f:\\xaa\\x9a\\...b'\\\\\\xe1WW\\x97\\x06$\\x9eV=12h\\xce\\x7f:\\xaa\\x9a\\...BIRMINGHAM WOMENS AND CHILDRENS NHS FOUNDATION...<NA>B4 6NH
\n
" + }, + "metadata": {}, + "execution_count": 19 + } + ], + "source": [ + "start = time.time()\n", + "\n", + "df = cmf.query(\n", + " selector=my_selector, return_type=\"pandas\", model=my_model\n", + ")\n", + "\n", + "elapsed = time.time() - start\n", + "print(f\"Total time: {timedelta(seconds=elapsed)}\")\n", + "df.head(3)" + ] + }, + { + "source": [ + "## 🟡 Export wins\n", + "\n", + "50k records, takes about a minute. Slower than you'd hope and seems to share a query plan with Data Hub, but is small enough it doesn't matter." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "WITH RECURSIVE source_data_unnested AS (\n SELECT\n _team_cmf.cmf__source_data.sha1 AS sha1,\n UNNEST(_team_cmf.cmf__source_data.id) AS id,\n _team_cmf.cmf__source_data.dataset AS dataset\n FROM _team_cmf.cmf__source_data\n), anon_1 AS (\n SELECT\n source_data_unnested.sha1 AS data_sha1,\n dbt.export_wins__wins_dataset.company_name AS dbt_export_wins__wins_dataset_company_name,\n dbt.export_wins__wins_dataset.cdms_reference AS dbt_export_wins__wins_dataset_cdms_reference\n FROM source_data_unnested\n LEFT OUTER JOIN dbt.export_wins__wins_dataset\n ON source_data_unnested.id = CAST(dbt.export_wins__wins_dataset.id AS VARCHAR)\n AND source_data_unnested.dataset = CAST(CAST('cc89099f-d065-49cc-aa45-e08e1db6653a' AS UUID) AS UUID)\n WHERE\n NOT dbt.export_wins__wins_dataset.id IS NULL\n), allowed AS (\n SELECT\n _team_cmf.cmf__ddupe_contains.parent AS parent,\n _team_cmf.cmf__ddupe_contains.child AS child\n FROM _team_cmf.cmf__ddupe_contains\n JOIN _team_cmf.cmf__clusters AS cmf__clusters_1\n ON _team_cmf.cmf__ddupe_contains.parent = cmf__clusters_1.sha1\n JOIN _team_cmf.cmf__models_create_clusters\n ON _team_cmf.cmf__models_create_clusters.child = cmf__clusters_1.sha1\n JOIN _team_cmf.cmf__models\n ON 
_team_cmf.cmf__models_create_clusters.parent = _team_cmf.cmf__models.sha1\n WHERE\n _team_cmf.cmf__models.sha1 IN (CAST('\\x17a46e0ea365597922a07f6b7bb01b9956807b21' AS BYTEA))\n UNION\n SELECT\n _team_cmf.cmf__link_contains.parent AS parent,\n _team_cmf.cmf__link_contains.child AS child\n FROM _team_cmf.cmf__link_contains\n JOIN _team_cmf.cmf__clusters AS cmf__clusters_1\n ON _team_cmf.cmf__link_contains.parent = cmf__clusters_1.sha1\n JOIN _team_cmf.cmf__clusters AS cmf__clusters_2\n ON _team_cmf.cmf__link_contains.child = cmf__clusters_2.sha1\n JOIN _team_cmf.cmf__models_create_clusters\n ON _team_cmf.cmf__models_create_clusters.child = cmf__clusters_1.sha1\n JOIN _team_cmf.cmf__models\n ON _team_cmf.cmf__models_create_clusters.parent = _team_cmf.cmf__models.sha1\n WHERE\n _team_cmf.cmf__models.sha1 IN (CAST('\\x17a46e0ea365597922a07f6b7bb01b9956807b21' AS BYTEA))\n), root AS (\n SELECT\n allowed.parent AS parent,\n allowed.child AS child\n FROM allowed\n JOIN _team_cmf.cmf__clusters\n ON _team_cmf.cmf__clusters.sha1 = allowed.parent\n JOIN _team_cmf.cmf__models_create_clusters\n ON _team_cmf.cmf__models_create_clusters.child = _team_cmf.cmf__clusters.sha1\n JOIN _team_cmf.cmf__models\n ON _team_cmf.cmf__models_create_clusters.parent = _team_cmf.cmf__models.sha1\n WHERE\n _team_cmf.cmf__models.sha1 = CAST('\\x17a46e0ea365597922a07f6b7bb01b9956807b21' AS BYTEA)\n), recurse(parent, child) AS (\n SELECT\n root.parent AS parent,\n root.child AS child\n FROM root\n UNION\n SELECT\n recurse.parent AS parent,\n allowed.child AS child\n FROM allowed\n JOIN recurse\n ON allowed.parent = recurse.child\n)\nSELECT\n recurse.parent AS cluster_sha1,\n anon_1.data_sha1,\n anon_1.dbt_export_wins__wins_dataset_company_name,\n anon_1.dbt_export_wins__wins_dataset_cdms_reference\nFROM anon_1\nJOIN recurse\n ON recurse.child = anon_1.data_sha1\n" + } + ], + "source": [ + "my_selector = selector(\n", + " table=\"dbt.export_wins__wins_dataset\",\n", + " fields=[\"company_name\", 
\"cdms_reference\"],\n", + ")\n", + "my_model = \"naive_export_wins_v1\"\n", + "\n", + "compiled = compile_query_to_postgresql(selector=my_selector, model=my_model)\n", + "print(compiled)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Total time: 0:00:22.835470\n" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": " cluster_sha1 \\\n0 b'\\x02\\xd3\\xc3\\xfb\\x01KnG\\xc9,\\x07\\xb1\\xc1\\x11... \n1 b'\\x04\\xdfY\\xad\\xadtT\\x1b\\xed\\xfd\\x06w\\xe9J\\xf... \n2 b'\\x06\\xc1S\\xb5p\\x88SZ\\xbcV\\xd0a\\xfbT\\xad\\xd3g... \n\n data_sha1 \\\n0 b'\\x02\\xd3\\xc3\\xfb\\x01KnG\\xc9,\\x07\\xb1\\xc1\\x11... \n1 b'&\\x04\\x9a\\xda~v\\xbeu?F\\xf0\\xfd\\x92\\xa7IP\\xfa... \n2 b'\\x8cV\\xb8[\\xac\\xa6K,]\\xb1\\x96\\xbf\\xfe\\x1a\\x9... \n\n dbt_export_wins__wins_dataset_company_name \\\n0 ETA Green Power Limited \n1 Med-Eq (Europe) Ltd \n2 Silver Lined Horizons Ltd \n\n dbt_export_wins__wins_dataset_cdms_reference \n0 Companies House ref: 12359858 \n1 ORG-10109781 \n2 ORG-10170829 ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
cluster_sha1data_sha1dbt_export_wins__wins_dataset_company_namedbt_export_wins__wins_dataset_cdms_reference
0b'\\x02\\xd3\\xc3\\xfb\\x01KnG\\xc9,\\x07\\xb1\\xc1\\x11...b'\\x02\\xd3\\xc3\\xfb\\x01KnG\\xc9,\\x07\\xb1\\xc1\\x11...ETA Green Power LimitedCompanies House ref: 12359858
1b'\\x04\\xdfY\\xad\\xadtT\\x1b\\xed\\xfd\\x06w\\xe9J\\xf...b'&\\x04\\x9a\\xda~v\\xbeu?F\\xf0\\xfd\\x92\\xa7IP\\xfa...Med-Eq (Europe) LtdORG-10109781
2b'\\x06\\xc1S\\xb5p\\x88SZ\\xbcV\\xd0a\\xfbT\\xad\\xd3g...b'\\x8cV\\xb8[\\xac\\xa6K,]\\xb1\\x96\\xbf\\xfe\\x1a\\x9...Silver Lined Horizons LtdORG-10170829
\n
" + }, + "metadata": {}, + "execution_count": 15 + } + ], + "source": [ + "start = time.time()\n", + "\n", + "df = cmf.query(\n", + " selector=my_selector, return_type=\"pandas\", model=my_model\n", + ")\n", "\n", - "When I compiled and ran it was faster. Now when I run in SQLAlchemy it's faster. All I can conclude is that overall database load is what's screwing with this." + "elapsed = time.time() - start\n", + "print(f\"Total time: {timedelta(seconds=elapsed)}\")\n", + "df.head(3)" ] }, + { + "source": [ + "## 🟢 Companies House\n", + "\n", + "5.5m records, takes about 3 minutes. Weirdly fast -- query plan indicates hash joins. Why does this work well?" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "WITH RECURSIVE source_data_unnested AS (\n SELECT\n _team_cmf.cmf__source_data.sha1 AS sha1,\n UNNEST(_team_cmf.cmf__source_data.id) AS id,\n _team_cmf.cmf__source_data.dataset AS dataset\n FROM _team_cmf.cmf__source_data\n), anon_1 AS (\n SELECT\n source_data_unnested.sha1 AS data_sha1,\n companieshouse.companies.company_name AS companieshouse_companies_company_name,\n companieshouse.companies.company_number AS companieshouse_companies_company_number,\n companieshouse.companies.postcode AS companieshouse_companies_postcode\n FROM source_data_unnested\n LEFT OUTER JOIN companieshouse.companies\n ON source_data_unnested.id = CAST(companieshouse.companies.id AS VARCHAR)\n AND source_data_unnested.dataset = CAST(CAST('592b69e0-ce95-47a6-9f0a-bcd792f214a4' AS UUID) AS UUID)\n WHERE\n NOT companieshouse.companies.id IS NULL\n), allowed AS (\n SELECT\n _team_cmf.cmf__ddupe_contains.parent AS parent,\n _team_cmf.cmf__ddupe_contains.child AS child\n FROM _team_cmf.cmf__ddupe_contains\n JOIN _team_cmf.cmf__clusters AS cmf__clusters_1\n ON _team_cmf.cmf__ddupe_contains.parent = cmf__clusters_1.sha1\n JOIN 
_team_cmf.cmf__models_create_clusters\n ON _team_cmf.cmf__models_create_clusters.child = cmf__clusters_1.sha1\n JOIN _team_cmf.cmf__models\n ON _team_cmf.cmf__models_create_clusters.parent = _team_cmf.cmf__models.sha1\n WHERE\n _team_cmf.cmf__models.sha1 IN (CAST('\\x5666a21720152c92b6b89be7d61e336d4ca684bf' AS BYTEA))\n UNION\n SELECT\n _team_cmf.cmf__link_contains.parent AS parent,\n _team_cmf.cmf__link_contains.child AS child\n FROM _team_cmf.cmf__link_contains\n JOIN _team_cmf.cmf__clusters AS cmf__clusters_1\n ON _team_cmf.cmf__link_contains.parent = cmf__clusters_1.sha1\n JOIN _team_cmf.cmf__clusters AS cmf__clusters_2\n ON _team_cmf.cmf__link_contains.child = cmf__clusters_2.sha1\n JOIN _team_cmf.cmf__models_create_clusters\n ON _team_cmf.cmf__models_create_clusters.child = cmf__clusters_1.sha1\n JOIN _team_cmf.cmf__models\n ON _team_cmf.cmf__models_create_clusters.parent = _team_cmf.cmf__models.sha1\n WHERE\n _team_cmf.cmf__models.sha1 IN (CAST('\\x5666a21720152c92b6b89be7d61e336d4ca684bf' AS BYTEA))\n), root AS (\n SELECT\n allowed.parent AS parent,\n allowed.child AS child\n FROM allowed\n JOIN _team_cmf.cmf__clusters\n ON _team_cmf.cmf__clusters.sha1 = allowed.parent\n JOIN _team_cmf.cmf__models_create_clusters\n ON _team_cmf.cmf__models_create_clusters.child = _team_cmf.cmf__clusters.sha1\n JOIN _team_cmf.cmf__models\n ON _team_cmf.cmf__models_create_clusters.parent = _team_cmf.cmf__models.sha1\n WHERE\n _team_cmf.cmf__models.sha1 = CAST('\\x5666a21720152c92b6b89be7d61e336d4ca684bf' AS BYTEA)\n), recurse(parent, child) AS (\n SELECT\n root.parent AS parent,\n root.child AS child\n FROM root\n UNION\n SELECT\n recurse.parent AS parent,\n allowed.child AS child\n FROM allowed\n JOIN recurse\n ON allowed.parent = recurse.child\n)\nSELECT\n recurse.parent AS cluster_sha1,\n anon_1.data_sha1,\n anon_1.companieshouse_companies_company_name,\n anon_1.companieshouse_companies_company_number,\n anon_1.companieshouse_companies_postcode\nFROM anon_1\nJOIN 
recurse\n ON recurse.child = anon_1.data_sha1\n" + } + ], + "source": [ + "my_selector = selector(\n", + " table=\"companieshouse.companies\",\n", + " fields=[\"company_name\", \"company_number\", \"postcode\"],\n", + ")\n", + "my_model = \"naive_companies_house_v1\"\n", + "\n", + "compiled = compile_query_to_postgresql(selector=my_selector, model=my_model)\n", + "print(compiled)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Total time: 0:02:12.507736\n" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": " cluster_sha1 \\\n0 b'\\x00O!\\x9bf\\x91\\xb0\\xfe\\xb9v]\\x0c\\xa3\\xb6l5\\... \n1 b'\\x00]\\x95\\x8a\\xbex\\x1bA\\xa6\\xa5\\xf9\\x88\\x17\\... \n2 b'\\x00af\\x91\\x8f\\x97xH\\xc3\\x9f\\xa6\\r\\x13\\xf1\\x... \n\n data_sha1 \\\n0 b'\\x00O!\\x9bf\\x91\\xb0\\xfe\\xb9v]\\x0c\\xa3\\xb6l5\\... \n1 b'\\x00]\\x95\\x8a\\xbex\\x1bA\\xa6\\xa5\\xf9\\x88\\x17\\... \n2 b'\\x00af\\x91\\x8f\\x97xH\\xc3\\x9f\\xa6\\r\\x13\\xf1\\x... \n\n companieshouse_companies_company_name \\\n0 ARCADE GEEKS INT LTD \n1 LOWELL GROUP SHARED SERVICES LIMITED \n2 KIMDOOLE LTD \n\n companieshouse_companies_company_number companieshouse_companies_postcode \n0 13231865 DY13 9RH \n1 08647094 LS15 8GH \n2 14445223 WC2H 9JQ ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
cluster_sha1data_sha1companieshouse_companies_company_namecompanieshouse_companies_company_numbercompanieshouse_companies_postcode
0b'\\x00O!\\x9bf\\x91\\xb0\\xfe\\xb9v]\\x0c\\xa3\\xb6l5\\...b'\\x00O!\\x9bf\\x91\\xb0\\xfe\\xb9v]\\x0c\\xa3\\xb6l5\\...ARCADE GEEKS INT LTD13231865DY13 9RH
1b'\\x00]\\x95\\x8a\\xbex\\x1bA\\xa6\\xa5\\xf9\\x88\\x17\\...b'\\x00]\\x95\\x8a\\xbex\\x1bA\\xa6\\xa5\\xf9\\x88\\x17\\...LOWELL GROUP SHARED SERVICES LIMITED08647094LS15 8GH
2b'\\x00af\\x91\\x8f\\x97xH\\xc3\\x9f\\xa6\\r\\x13\\xf1\\x...b'\\x00af\\x91\\x8f\\x97xH\\xc3\\x9f\\xa6\\r\\x13\\xf1\\x...KIMDOOLE LTD14445223WC2H 9JQ
\n
" + }, + "metadata": {}, + "execution_count": 17 + } + ], + "source": [ + "start = time.time()\n", + "\n", + "df = cmf.query(\n", + " selector=my_selector, return_type=\"pandas\", model=my_model\n", + ")\n", + "\n", + "elapsed = time.time() - start\n", + "print(f\"Total time: {timedelta(seconds=elapsed)}\")\n", + "df.head(3)" + ] + }, + { + "source": [ + "# Scratch\n", + "\n", + "The below is me messing about. Here be dragons." + ], + "cell_type": "markdown", + "metadata": {} + }, { "cell_type": "code", "execution_count": 45, @@ -823,7 +1077,7 @@ "kernelspec": { "display_name": "Python 3.9.16 64-bit ('company_matching': conda)", "language": "python", - "name": "python_defaultSpec_1710783338709" + "name": "python_defaultSpec_1711550197230" }, "language_info": { "codemirror_mode": {