From 3890bdb731dc9346f4470fab0ff04ce82bc3d5c7 Mon Sep 17 00:00:00 2001 From: John A Stevenson Date: Sun, 12 May 2024 23:00:21 +0100 Subject: [PATCH] Update transform section --- docs/demo_copy.py | 16 +++++----- docs/etl_functions/demo_named_tuple.py | 1 - docs/etl_functions/error_handling.rst | 2 ++ docs/etl_functions/extract.rst | 9 ++++-- docs/etl_functions/transform.rst | 43 +++++++++++++++----------- docs/index.rst | 6 ++-- 6 files changed, 46 insertions(+), 31 deletions(-) diff --git a/docs/demo_copy.py b/docs/demo_copy.py index fe16939..7aa9405 100644 --- a/docs/demo_copy.py +++ b/docs/demo_copy.py @@ -15,14 +15,14 @@ )""" -select_sql = "SELECT name FROM igneous_rock" - - def transform(chunk): for row in chunk: - row['category'] = 'igneous' - row['last_update'] = dt.datetime.now() - yield row + new_row = { + "name": row["name"], + "category": "igneous", + "last_update": dt.datetime.now() + } + yield new_row etl.log_to_console() @@ -33,8 +33,8 @@ def transform(chunk): etl.execute(create_sql, dest) # Copy data - rows = etl.iter_rows(select_sql, src, transform=transform) - etl.load('rock', dest, rows) + rows = etl.copy_table_rows('igneous_rock', src, dest, + target='rock', transform=transform) # Confirm transfer for row in etl.fetchall('SELECT * FROM rock', dest): diff --git a/docs/etl_functions/demo_named_tuple.py b/docs/etl_functions/demo_named_tuple.py index aecdb8d..cef0748 100644 --- a/docs/etl_functions/demo_named_tuple.py +++ b/docs/etl_functions/demo_named_tuple.py @@ -3,7 +3,6 @@ import etlhelper as etl from etlhelper.row_factories import namedtuple_row_factory - with sqlite3.connect("igneous_rocks.db") as conn: row = etl.fetchone('SELECT * FROM igneous_rock', conn, row_factory=namedtuple_row_factory) diff --git a/docs/etl_functions/error_handling.rst b/docs/etl_functions/error_handling.rst index d23654b..1dde201 100644 --- a/docs/etl_functions/error_handling.rst +++ b/docs/etl_functions/error_handling.rst @@ -5,6 +5,8 @@ This section describes 
exception classes and on_error functions. logged errors +also handling errors in SQL e.g. ON CONFLICT + Handling insert errors ---------------------- diff --git a/docs/etl_functions/extract.rst b/docs/etl_functions/extract.rst index c2b6b9c..311774c 100644 --- a/docs/etl_functions/extract.rst +++ b/docs/etl_functions/extract.rst @@ -159,14 +159,14 @@ The ``pyodbc`` driver for MSSQL only supports positional placeholders. When using the ``load`` function in conjuction with ``iter_chunks`` data must be either named tuples or dictionaries. -Transform +transform """"""""" The ``transform`` parameter takes a callable (e.g. function) that transforms the data before returning it. See the :ref:`Transform <transform>` section for details. -Chunk size +chunk_size """""""""" All data extraction functions use ``iter_chunks`` behind the scenes. @@ -174,3 +174,8 @@ This reads rows from the database in *chunks* to prevent them all being loaded into memory at once. The ``chunk_size`` argument sets the number of rows in each chunk. The default ``chunk_size`` is 5000. + +Return values +------------- + +TODO! \ No newline at end of file diff --git a/docs/etl_functions/transform.rst b/docs/etl_functions/transform.rst index de46f16..4f7ed9a 100644 --- a/docs/etl_functions/transform.rst +++ b/docs/etl_functions/transform.rst @@ -3,21 +3,18 @@ Transform ^^^^^^^^^ -Data can be transformed in-flight by applying a transform function. This -is any Python callable (e.g. function or class) that takes an iterator +ETL Helper functions accept a function as the ``transform`` keyword argument, +which enables transformation of data in flight. + +This is any Python callable (e.g. function or class) that takes an iterator and returns another iterator (e.g. list or generator via the ``yield`` -statement). Transform functions are applied to data as they are read +statement).
+Transform functions are applied to data as they are read from the database (in the case of data fetching functions and ``copy_rows``), or before they are passed as query parameters (to -``executemany`` or ``load``). When used with ``copy_rows`` or -``executemany`` the INSERT query must contain the correct placeholders -for the transform result. - -The ``iter_chunks`` and ``iter_rows`` functions that are used internally -return generators. Each chunk or row of data is only accessed when it is -required. This allows data transformation to be performed via -`memory-efficient -iterator-chains <https://dbader.org/blog/python-iterator-chains>`__. +``executemany`` or ``load``). +When used with ``copy_rows`` or ``executemany`` the INSERT query must contain +the correct parameter placeholders for the transformed result. The simplest transform functions modify data returned mutable row factories e.g., ``dict_row_factory`` in-place. The ``yield`` keyword @@ -27,6 +24,7 @@ that can loop over the rows. .. code:: python from typing import Iterator + import etlhelper as etl from etlhelper.row_factories import dict_row_factory @@ -40,11 +38,12 @@ that can loop over the rows. yield row - fetchall(select_sql, src_conn, row_factory=dict_row_factory, - transform=my_transform) + etl.fetchall(select_sql, src_conn, row_factory=dict_row_factory, + transform=my_transform) It is also possible to assemble the complete transformed chunk and -return it. This code demonstrates that the returned chunk can have a +return it. +This code demonstrates that the returned chunk can have a different number of rows, and be of different length, to the input. Because ``namedtuple``\ s are immutable, we have to create a ``new_row`` from each input ``row``. @@ -53,6 +52,7 @@ from each input ``row``. import random from typing import Iterator + import etlhelper as etl from etlhelper.row_factories import namedtuple_row_factory @@ -68,10 +68,17 @@ from each input ``row``.
return new_chunk - fetchall(select_sql, src_conn, row_factory=namedtuple_row_factory, - transform=my_transform) + etl.fetchall(select_sql, src_conn, row_factory=namedtuple_row_factory, + transform=my_transform) Any Python code can be used within the function and extra data can result from a calculation, a call to a webservice or a query against -another database. As a standalone function with known inputs and +another database. +As a standalone function with known inputs and outputs, the transform functions are also easy to test. + +The ``iter_chunks`` and ``iter_rows`` functions return generators. +Each chunk or row of data is only accessed when it is +required. This allows data transformation to be performed via +`memory-efficient +iterator-chains <https://dbader.org/blog/python-iterator-chains>`__. diff --git a/docs/index.rst b/docs/index.rst index 17a3413..54c0a99 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,7 +18,7 @@ Welcome to ETL Helper's documentation! :target: https://pypi.org/project/etlhelper .. image:: https://img.shields.io/pypi/dm/etlhelper?label=Downloads%20pypi -ETL Helper is a Python ETL library to simplify data transfer into and out of databases. +ETL Helper is a Python ETL (Extract, Transform, Load) library to simplify data transfer into and out of databases. .. note:: This documentation is a work in progress in preparation for the upcoming 1.0 release. @@ -94,7 +94,7 @@ The output is: Copying data ------------ -This script copies the data to another database, with transformation and logging. +This script copies data to another database, with transformation and logging. .. literalinclude:: demo_copy.py :language: python @@ -112,3 +112,5 @@ The output is: {'id': 1, 'name': 'basalt', 'category': 'igneous', 'last_update': '2024-05-08 14:59:54.878726'} {'id': 2, 'name': 'granite', 'category': 'igneous', 'last_update': '2024-05-08 14:59:54.879034'} + +The :doc:`recipes` section has more example code. \ No newline at end of file