From 743889f394e5407eb7c73bb3ae7063ab71992dde Mon Sep 17 00:00:00 2001 From: John A Stevenson Date: Sat, 11 May 2024 23:15:20 +0100 Subject: [PATCH] Refactor Extract section --- docs/etl_functions/demo_named_tuple.py | 12 +++ docs/etl_functions/extract.rst | 125 ++++++++++++++----------- 2 files changed, 84 insertions(+), 53 deletions(-) create mode 100644 docs/etl_functions/demo_named_tuple.py diff --git a/docs/etl_functions/demo_named_tuple.py b/docs/etl_functions/demo_named_tuple.py new file mode 100644 index 0000000..aecdb8d --- /dev/null +++ b/docs/etl_functions/demo_named_tuple.py @@ -0,0 +1,12 @@ +"""Extract script using namedtuple_row_factory""" +import sqlite3 +import etlhelper as etl +from etlhelper.row_factories import namedtuple_row_factory + + +with sqlite3.connect("igneous_rocks.db") as conn: + row = etl.fetchone('SELECT * FROM igneous_rock', conn, + row_factory=namedtuple_row_factory) + +print(row) +print(row.name) diff --git a/docs/etl_functions/extract.rst b/docs/etl_functions/extract.rst index eba899c..c2b6b9c 100644 --- a/docs/etl_functions/extract.rst +++ b/docs/etl_functions/extract.rst @@ -65,62 +65,83 @@ returns {'id': 2, 'name': 'granite', 'grain_size': 'coarse'} -Parameters ----------- +Keyword arguments +----------------- + +All extract functions are derived from :func:`iter_chunks() ` +and take the same keyword arguments, which are passed through. + +parameters +"""""""""" Variables can be inserted into queries by passing them as parameters. These “bind variables” are sanitised by the underlying drivers to -prevent `SQL injection attacks `__. The required +prevent `SQL injection attacks `__. + +It is necessary to use the correct `paramstyle `__ -can be checked with ``MY_DB.paramstyle``. A tuple is used for positional -placeholders, or a dictionary for named placeholders. +for the database type as a placeholder (e.g. ``?``, ``:1``). -.. code:: python +The paramstyle for a DbParams object can be checked with the +:func:`paramstyle ` attribute. - select_sql = "SELECT * FROM src WHERE id = :id" +A dictionary is used for named placeholders, - with ORACLEDB.connect("ORA_PASSWORD") as conn: - fetchall(sql, conn, parameters={'id': 1}) +.. code:: python -Row factories -------------- + select_sql = "SELECT * FROM src WHERE id = :id" # SQLite style -Row factories control the output format of returned rows. + with sqlite3.connect("rocks.db") as conn: + etl.fetchall(sql, conn, parameters={'id': 1}) -For example return each row as a dictionary, use the following: +or a tuple for positional placeholders. .. code:: python - from etlhelper import fetchall - from etlhelper.row_factories import dict_row_factory + select_sql = "SELECT * FROM src WHERE id = ?" # SQLite style + + with sqlite3.connect("rocks.db") as conn: + etl.fetchall(sql, conn, parameters=(1,)) + - sql = "SELECT * FROM my_table" +Named parameters result in more readable code. - with ORACLEDB.connect('ORACLE_PASSWORD') as conn: - for row in fetchall(sql, conn, row_factory=dict_row_factory): - print(row['id']) -The ``dict_row_factory`` is useful when data are to be serialised to -JSON/YAML, as those formats use dictionaries as input. +row_factory +""""""""""" + +Row factories control the output format of returned rows. +The default row factory for ETL Helper is a dictionary, but this can be +changed with the ``row_factory`` argument. + +.. literalinclude:: demo_named_tuple.py + :language: python + +The output is: + +.. code:: bash + + Row(id=1, name='basalt', grain_size='fine') + basalt Four different row_factories are included, based in built-in Python types: -+------------------+------------------+---------+------------------+ -| Row Factory | Attribute access | Mutable | Parameter | -| | | | placeholder | -+==================+==================+=========+==================+ -| dict_row_factory | ``row["id"]`` | Yes | Named | -| (default) | | | | -+------------------+------------------+---------+------------------+ -| t | ``row[0]`` | No | Positional | -| uple_row_factory | | | | -+------------------+------------------+---------+------------------+ -| list_row_factory | ``row[0]`` | Yes | Positional | -+------------------+------------------+---------+------------------+ -| namedt | ``row.id`` or | No | Positional | -| uple_row_factory | ``row[0]`` | | | -+------------------+------------------+---------+------------------+ ++-----------------------+------------------+---------+------------------+ +| Row Factory | Attribute access | Mutable | Parameter | +| | | | placeholder | ++=======================+==================+=========+==================+ +| dict_row_factory | ``row["id"]`` | Yes | Named | +| (default) | | | | ++-----------------------+------------------+---------+------------------+ +| tuple_row_factory | ``row[0]`` | No | Positional | ++-----------------------+------------------+---------+------------------+ +| list_row_factory | ``row[0]`` | Yes | Positional | ++-----------------------+------------------+---------+------------------+ +| namedtuple_row_factory| ``row.id`` or | No | Positional (or | +| | ``row[0]`` | | Named with | +| | | | *load*) | ++-----------------------+------------------+---------+------------------+ The choice of row factory depends on the use case. In general named tuples and dictionaries are best for readable code, while using tuples @@ -128,30 +149,28 @@ or lists can give a slight increase in performance. Mutable rows are convenient when used with transform functions because they can be modified without need to create a whole new output row. -When using ``copy_rows``, it is necessary to use appropriate parameter -placeholder style for the chosen row factory in the INSERT query. Using -the ``dict_row_factory`` requires a switch from named to positional -parameter placeholders (e.g. ``%(id)s`` instead of ``%s`` for -PostgreSQL, ``:id`` instead of ``:1`` for Oracle). The ``pyodbc`` driver -for MSSQL only supports positional placeholders. +When loading or copying data, it is necessary to use appropriate parameter +placeholder style for the chosen row factory in the INSERT query. +Using the ``tuple_row_factory`` requires a switch from named to positional +parameter placeholders (e.g. ``%s`` instead of ``%(id)s`` for PostgreSQL, +``:1`` instead of ``:id`` for Oracle). +The ``pyodbc`` driver for MSSQL only supports positional placeholders. When using the ``load`` function in conjuction with ``iter_chunks`` data must be either named tuples or dictionaries. Transform ---------- +""""""""" -The ``transform`` parameter allows passing of a function to transform -the data before returning it. The function must take a list of rows and -return a list of modified rows. Rows of mutable types (dict, list) can -be modified in-place, while rows of immutable types (tuples, -namedtuples) must be created as new objects from the input rows. See -``transform`` for more details. +The ``transform`` parameter takes a callable (e.g. function) that +transforms the data before returning it. +See the :ref:`Transform ` section for details. Chunk size ----------- +"""""""""" All data extraction functions use ``iter_chunks`` behind the scenes. -This reads rows from the database in chunks to prevent them all being -loaded into memory at once. The default ``chunk_size`` is 5000 and this -can be set via keyword argument. +This reads rows from the database in *chunks* to prevent them all being +loaded into memory at once. +The ``chunk_size`` argument sets the number of rows in each chunk. +The default ``chunk_size`` is 5000.