diff --git a/.circleci/config.yml b/.circleci/config.yml
deleted file mode 100644
index 911df022..00000000
--- a/.circleci/config.yml
+++ /dev/null
@@ -1,41 +0,0 @@
-version: 2
-jobs:
- build:
- docker:
- - image: circleci/python:3.6.2
- steps:
- - checkout
-
- - run:
- name: install dependencies
- command: |
- python3 -m venv venv
- . venv/bin/activate
- pip install --upgrade pip setuptools wheel
- pip install .[test]
-
- - run:
- name: 'Pylinting'
- command: |
- . venv/bin/activate
- pylint target_snowflake
-
- - run:
- name: 'Unit Tests'
- command: |
- . venv/bin/activate
- export LOGGING_CONF_FILE=$(pwd)/sample_logging.conf
- pytest tests/unit -vv --cov target_snowflake --cov-fail-under=55
-
- - run:
- name: 'Integration Tests'
- command: |
- . venv/bin/activate
- export LOGGING_CONF_FILE=$(pwd)/sample_logging.conf
- pytest tests/integration/ -vv --cov target_snowflake --cov-fail-under=86
-
-workflows:
- version: 2
- build:
- jobs:
- - build
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 00000000..786c3490
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,74 @@
+name: CI
+
+on:
+ pull_request:
+ push:
+ branches:
+ - master
+
+jobs:
+ lint_and_test:
+ name: Linting and Testing
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: [ 3.7, 3.8, 3.9 ]
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v2
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Setup virtual environment
+ run: make venv
+
+ - name: Pylinting
+ run: make pylint
+
+ - name: Unit Tests
+ run: make unit_test
+
+ integration_test:
+ name: Integration Testing
+ runs-on: ubuntu-latest
+ environment: ci_tests
+ strategy:
+ matrix:
+ python-version: [ 3.8 ]
+ concurrency:
+ group: integration_tests-${{ github.head_ref }}
+ cancel-in-progress: true
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v2
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Setup virtual environment
+ run: make venv
+
+ - name: Integration tests
+ run: make integration_test
+ env:
+ TARGET_SNOWFLAKE_ACCOUNT: ${{ secrets.TARGET_SNOWFLAKE_ACCOUNT }}
+ TARGET_SNOWFLAKE_DBNAME: ${{ secrets.TARGET_SNOWFLAKE_DBNAME }}
+ TARGET_SNOWFLAKE_USER: ${{ secrets.TARGET_SNOWFLAKE_USER }}
+ TARGET_SNOWFLAKE_PASSWORD: ${{ secrets.TARGET_SNOWFLAKE_PASSWORD }}
+ TARGET_SNOWFLAKE_WAREHOUSE: ${{ secrets.TARGET_SNOWFLAKE_WAREHOUSE }}
+ TARGET_SNOWFLAKE_SCHEMA: ${{ secrets.TARGET_SNOWFLAKE_SCHEMA }}
+ TARGET_SNOWFLAKE_AWS_ACCESS_KEY: ${{ secrets.TARGET_SNOWFLAKE_AWS_ACCESS_KEY }}
+ TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY: ${{ secrets.TARGET_SNOWFLAKE_AWS_SECRET_ACCESS_KEY }}
+ TARGET_SNOWFLAKE_S3_BUCKET: ${{ secrets.TARGET_SNOWFLAKE_S3_BUCKET }}
+ TARGET_SNOWFLAKE_S3_KEY_PREFIX: ${{ secrets.TARGET_SNOWFLAKE_S3_KEY_PREFIX }}
+ TARGET_SNOWFLAKE_STAGE: ${{ secrets.TARGET_SNOWFLAKE_STAGE }}
+ TARGET_SNOWFLAKE_FILE_FORMAT_CSV: ${{ secrets.TARGET_SNOWFLAKE_FILE_FORMAT_CSV }}
+ TARGET_SNOWFLAKE_FILE_FORMAT_PARQUET: ${{ secrets.TARGET_SNOWFLAKE_FILE_FORMAT_PARQUET }}
+ CLIENT_SIDE_ENCRYPTION_MASTER_KEY: ${{ secrets.TARGET_SNOWFLAKE_CLIENT_SIDE_ENCRYPTION_MASTER_KEY }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c2dc6a48..152ab5e6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,84 @@
+2.3.0 (2023-08-08)
+-------------------
+
+*Changes*
+- Update dependencies:
+ - snowflake-connector-python[pandas]
+ - boto3
+ - pytest
+ - python-dotenv
+
+
+2.2.0 (2022-05-12)
+-------------------
+
+*Changes*
+- Revert use of `ujson`
+
+
+2.1.0 (2022-05-05)
+-------------------
+
+*Changes*
+- Use `ujson` for JSON encoding/decoding
+
+2.0.1 (2022-04-08)
+-------------------
+
+*Fixes*
+- Only drop pk constraint if table has one
+- Don't raise `PrimaryKeyNotFoundException` when a record has a falsy pk value
+
+
+2.0.0 (2022-03-29)
+-------------------
+
+*Fixes*
+- Respecting `flush_all_streams` when SCHEMA messages arrive.
+- Improve logging for failed merge & copy queries.
+- Drop NOT NULL constraint from primary key columns.
+- Update PK constraints according to changes to SCHEMA's key properties.
+
+*Changes*
+- Dropping support for Python 3.6
+- Adding support for Python 3.9
+- Bump pytest to `7.1.1`
+- Bump boto3 to `1.21`
+
+
+1.15.0 (2022-01-14)
+-------------------
+
+*Added*
+- Support parallelism for table stages
+
+*Fixes*
+- Emit last encountered state message if there are no records.
+
+*Changes*
+- Migrate CI to github actions
+- Bump dependencies
+
+
+1.14.1 (2021-10-14)
+-------------------
+- Increase `max_records` when selecting columns by an order of magnitude
+- Bumping dependencies
+
+1.14.0 (2021-09-30)
+-------------------
+- Add support for `date` property format
+- Stop logging record when error happens
+
+1.13.1 (2021-07-15)
+-------------------
+- Fixed an issue with S3 metadata required for decryption not being included in archived load files.
+
+1.13.0 (2021-06-23)
+-------------------
+- Add `archive_load_files` parameter to optionally archive load files on S3
+- Bumping dependencies
+
1.12.0 (2021-04-12)
-------------------
- Add optional `batch_wait_limit_seconds` parameter
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..c8b5d069
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,17 @@
+venv:
+ python3 -m venv venv ;\
+ . ./venv/bin/activate ;\
+ pip install --upgrade pip setuptools wheel ;\
+ pip install -e .[test]
+
+pylint:
+ . ./venv/bin/activate ;\
+ pylint --rcfile pylintrc target_snowflake/
+
+unit_test:
+ . ./venv/bin/activate ;\
+ pytest tests/unit -vv --cov target_snowflake --cov-fail-under=67
+
+integration_test:
+ . ./venv/bin/activate ;\
+ pytest tests/integration/ -vvx --cov target_snowflake --cov-fail-under=86
diff --git a/README.md b/README.md
index 7b665a6e..0a7d3674 100644
--- a/README.md
+++ b/README.md
@@ -160,14 +160,13 @@ Full list of options in `config.json`:
| batch_size_rows | Integer | | (Default: 100000) Maximum number of rows in each batch. At the end of each batch, the rows in the batch are loaded into Snowflake. |
| batch_wait_limit_seconds | Integer | | (Default: None) Maximum time to wait for batch to reach `batch_size_rows`. |
| flush_all_streams | Boolean | | (Default: False) Flush and load every stream into Snowflake when one batch is full. Warning: This may trigger the COPY command to use files with a low number of records, and may cause performance problems. |
-| parallelism | Integer | | (Default: 0) The number of threads used to flush tables. 0 will create a thread for each stream, up to parallelism_max. -1 will create a thread for each CPU core. Any other positive number will create that number of threads, up to parallelism_max. **Parallelism works only with external stages. If no s3_bucket defined with an external stage then flusing tables is enforced to use a single thread.**|
+| parallelism | Integer | | (Default: 0) The number of threads used to flush tables. 0 will create a thread for each stream, up to parallelism_max. -1 will create a thread for each CPU core. Any other positive number will create that number of threads, up to parallelism_max. |
| parallelism_max | Integer | | (Default: 16) Max number of parallel threads to use when flushing tables. |
| default_target_schema | String | | Name of the schema where the tables will be created, **without** database prefix. If `schema_mapping` is not defined then every stream sent by the tap is loaded into this schema. |
| default_target_schema_select_permission | String | | Grant USAGE privilege on newly created schemas and grant SELECT privilege on newly created tables to a specific role or a list of roles. If `schema_mapping` is not defined then every stream sent by the tap is granted accordingly. |
| schema_mapping | Object | | Useful if you want to load multiple streams from one tap to multiple Snowflake schemas.
If the tap sends the `stream_id` in `<schema_name>-<table_name>` format then this option overwrites the `default_target_schema` value. Note that using `schema_mapping` you can overwrite the `default_target_schema_select_permission` value to grant SELECT permissions to different groups per schema, or optionally you can create indices automatically for the replicated tables.
**Note**: This is an experimental feature and recommended to use via PipelineWise YAML files that will generate the object mapping in the right JSON format. For further info check a [PipelineWise YAML Example]
| disable_table_cache | Boolean | | (Default: False) By default the connector caches the available table structures in Snowflake at startup. In this way it doesn't need to run additional queries when ingesting data to check if altering the target tables is required. With the `disable_table_cache` option you can turn off this caching. You will always see the most recent table structures, but it causes extra query runtime. |
| client_side_encryption_master_key | String | | (Default: None) When this is defined, Client-Side Encryption is enabled. The data in S3 will be encrypted; no third parties, including Amazon AWS and any ISPs, can see the data in the clear. The Snowflake COPY command will decrypt the data once it's in Snowflake. The master key must be 256 bits long and encoded as a base64 string. |
-| client_side_encryption_stage_object | String | | (Default: None) Required when `client_side_encryption_master_key` is defined. The name of the encrypted stage object in Snowflake that created separately and using the same encryption master key. |
| add_metadata_columns | Boolean | | (Default: False) Metadata columns add extra row level information about data ingestions (i.e. when the row was read from the source, when it was inserted or deleted in Snowflake, etc.). Metadata columns are created automatically by adding extra columns to the tables with a column prefix `_SDC_`. The column names follow the Stitch naming conventions documented at https://www.stitchdata.com/docs/data-structure/integration-schemas#sdc-columns. Enabling metadata columns will flag the deleted rows by setting the `_SDC_DELETED_AT` metadata column. Without the `add_metadata_columns` option the deleted rows from singer taps will not be recognisable in Snowflake. |
| hard_delete | Boolean | | (Default: False) When the `hard_delete` option is true, DELETE SQL commands will be performed in Snowflake to delete rows in tables. This is achieved by continuously checking the `_SDC_DELETED_AT` metadata column sent by the singer tap. Because deleting rows requires metadata columns, the `hard_delete` option automatically enables the `add_metadata_columns` option as well. |
| data_flattening_max_level | Integer | | (Default: 0) Object type RECORD items from taps can be loaded into VARIANT columns as JSON (default) or we can flatten the schema by creating columns automatically.
When value is 0 (default) then flattening functionality is turned off. |
@@ -176,6 +175,9 @@ Full list of options in `config.json`:
| temp_dir | String | | (Default: platform-dependent) Directory of temporary files with RECORD messages. |
| no_compression | Boolean | | (Default: False) Generate uncompressed files when loading to Snowflake. Normally, by default GZIP compressed files are generated. |
| query_tag | String | | (Default: None) Optional string to tag executed queries in Snowflake. Replaces tokens `{{database}}`, `{{schema}}` and `{{table}}` with the appropriate values. The tags are displayed in the output of the Snowflake `QUERY_HISTORY`, `QUERY_HISTORY_BY_*` functions. |
+| archive_load_files | Boolean | | (Default: False) When enabled, the files loaded to Snowflake will also be stored in `archive_load_files_s3_bucket` under the key `/{archive_load_files_s3_prefix}/{schema_name}/{table_name}/`. All archived files will have `tap`, `schema`, `table` and `archived-by` as S3 metadata keys. When incremental replication is used, the archived files will also have the following S3 metadata keys: `incremental-key`, `incremental-key-min` and `incremental-key-max`.
+| archive_load_files_s3_prefix | String | | (Default: "archive") When `archive_load_files` is enabled, the archived files will be placed in the archive S3 bucket under this prefix.
+| archive_load_files_s3_bucket | String | | (Default: Value of `s3_bucket`) When `archive_load_files` is enabled, the archived files will be placed in this bucket.
### To run tests:
@@ -184,7 +186,7 @@ Full list of options in `config.json`:
export TARGET_SNOWFLAKE_ACCOUNT=
export TARGET_SNOWFLAKE_DBNAME=
export TARGET_SNOWFLAKE_USER=
- export TARGET_SNOWFLAKE_PASSWORD=
+ export TARGET_SNOWFLAKE_PASSWORD=
export TARGET_SNOWFLAKE_WAREHOUSE=
export TARGET_SNOWFLAKE_SCHEMA=
export TARGET_SNOWFLAKE_AWS_ACCESS_KEY=
@@ -196,7 +198,6 @@ Full list of options in `config.json`:
export TARGET_SNOWFLAKE_FILE_FORMAT_CSV=
export TARGET_SNOWFLAKE_FILE_FORMAT_PARQUET=
export CLIENT_SIDE_ENCRYPTION_MASTER_KEY=
- export CLIENT_SIDE_ENCRYPTION_STAGE_OBJECT=
```
2. Install python test dependencies in a virtual env and run unit and integration tests
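For context on the three `archive_load_files*` options documented in the README hunk above, here is a minimal, hypothetical config sketch. All values are placeholders; only the keys come from the option table and from `validate_config()` in `target_snowflake/db_sync.py`, whose new check rejects `archive_load_files` without an `s3_bucket`.

```python
from target_snowflake.db_sync import validate_config

# Hypothetical config enabling load-file archiving; every value is a placeholder.
config = {
    "account": "rtxxxxx.eu-central-1",
    "dbname": "ANALYTICS_DB",
    "user": "LOADER",
    "password": "********",
    "warehouse": "LOAD_WH",
    "default_target_schema": "PUBLIC",
    "file_format": "ANALYTICS_DB.PUBLIC.CSV_FORMAT",
    "stage": "ANALYTICS_DB.PUBLIC.S3_STAGE",
    "s3_bucket": "my-load-bucket",               # required when archive_load_files is enabled
    "tap_id": "mysql_fx",                        # stored as the "tap" metadata key on archived files
    "archive_load_files": True,
    "archive_load_files_s3_prefix": "archive",             # default
    "archive_load_files_s3_bucket": "my-archive-bucket",   # defaults to s3_bucket
}

# Returns a list of error strings; the new validation adds an error if
# archive_load_files is enabled without an s3_bucket (i.e. without an external stage).
print(validate_config(config))
```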
diff --git a/setup.py b/setup.py
index 368ba449..001bdf30 100644
--- a/setup.py
+++ b/setup.py
@@ -6,32 +6,34 @@
long_description = f.read()
setup(name="pipelinewise-target-snowflake",
- version="1.12.0",
+ version="2.3.0",
description="Singer.io target for loading data to Snowflake - PipelineWise compatible",
long_description=long_description,
long_description_content_type='text/markdown',
- author="TransferWise",
+ author="Wise",
url='https://github.com/transferwise/pipelinewise-target-snowflake',
classifiers=[
'License :: OSI Approved :: Apache Software License',
- 'Programming Language :: Python :: 3 :: Only'
+ 'Programming Language :: Python :: 3 :: Only',
+ 'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3.9',
],
py_modules=["target_snowflake"],
+ python_requires='>=3.7',
install_requires=[
'pipelinewise-singer-python==1.*',
- 'snowflake-connector-python[pandas]==2.4.2',
+ 'snowflake-connector-python[pandas]==3.0.4',
'inflection==0.5.1',
- 'joblib==1.0.1',
- 'numpy<1.21.0',
- 'python-dateutil==2.8.1'
+ 'joblib==1.2.0',
+ 'boto3==1.28.20',
],
extras_require={
"test": [
- "mock==4.0.3",
- "pylint==2.7.4",
- 'pytest==6.2.3',
- 'pytest-cov==2.11.1',
- "python-dotenv==0.17.0"
+ "pylint==2.12.*",
+ 'pytest==7.4.0',
+ 'pytest-cov==3.0.0',
+ "python-dotenv>=0.19,<1.1"
]
},
entry_points="""
diff --git a/target_snowflake/__init__.py b/target_snowflake/__init__.py
index 19cfd069..4f502912 100644
--- a/target_snowflake/__init__.py
+++ b/target_snowflake/__init__.py
@@ -8,15 +8,15 @@
import sys
import copy
-from typing import Dict, List
+from typing import Dict, List, Optional
from joblib import Parallel, delayed, parallel_backend
from jsonschema import Draft7Validator, FormatChecker
from singer import get_logger
from datetime import datetime, timedelta
-import target_snowflake.file_formats.csv as csv
-import target_snowflake.file_formats.parquet as parquet
-import target_snowflake.stream_utils as stream_utils
+from target_snowflake.file_formats import csv
+from target_snowflake.file_formats import parquet
+from target_snowflake import stream_utils
from target_snowflake.db_sync import DbSync
from target_snowflake.file_format import FileFormatTypes
@@ -53,12 +53,12 @@ def add_metadata_columns_to_schema(schema_message):
return extended_schema_message
-def emit_state(state):
+def emit_state(state: Optional[Dict]):
"""Print state to stdout"""
if state is not None:
line = json.dumps(state)
LOGGER.info('Emitting state %s', line)
- sys.stdout.write("{}\n".format(line))
+ sys.stdout.write(f"{line}\n")
sys.stdout.flush()
@@ -75,7 +75,7 @@ def get_snowflake_statics(config):
if not ('disable_table_cache' in config and config['disable_table_cache']):
LOGGER.info('Getting catalog objects from table cache...')
- db = DbSync(config) # pylint: disable=invalid-name
+ db = DbSync(config) # pylint: disable=invalid-name
table_cache = db.get_table_columns(
table_schemas=stream_utils.get_schema_names_from_config(config))
@@ -84,8 +84,9 @@ def get_snowflake_statics(config):
return table_cache, file_format_type
+
# pylint: disable=too-many-locals,too-many-branches,too-many-statements,invalid-name
-def persist_lines(config, lines, table_cache=None, file_format_type: FileFormatTypes=None) -> None:
+def persist_lines(config, lines, table_cache=None, file_format_type: FileFormatTypes = None) -> None:
"""Main loop to read and consume singer messages from stdin
Params:
@@ -114,6 +115,8 @@ def persist_lines(config, lines, table_cache=None, file_format_type: FileFormatT
batch_size_rows = config.get('batch_size_rows', DEFAULT_BATCH_SIZE_ROWS)
batch_wait_limit_seconds = config.get('batch_wait_limit_seconds', None)
flush_timestamp = datetime.utcnow()
+ archive_load_files = config.get('archive_load_files', False)
+ archive_load_files_data = {}
# Loop over lines from stdin
for line in lines:
@@ -160,7 +163,7 @@ def persist_lines(config, lines, table_cache=None, file_format_type: FileFormatT
primary_key_string = stream_to_sync[stream].record_primary_key_string(o['record'])
if not primary_key_string:
- primary_key_string = 'RID-{}'.format(total_row_count[stream])
+ primary_key_string = f'RID-{total_row_count[stream]}'
if stream not in records_to_load:
records_to_load[stream] = {}
@@ -177,6 +180,21 @@ def persist_lines(config, lines, table_cache=None, file_format_type: FileFormatT
else:
records_to_load[stream][primary_key_string] = o['record']
+ if archive_load_files and stream in archive_load_files_data:
+ # Keep track of min and max of the designated column
+ stream_archive_load_files_values = archive_load_files_data[stream]
+ if 'column' in stream_archive_load_files_values:
+ incremental_key_column_name = stream_archive_load_files_values['column']
+ incremental_key_value = o['record'][incremental_key_column_name]
+ min_value = stream_archive_load_files_values['min']
+ max_value = stream_archive_load_files_values['max']
+
+ if min_value is None or min_value > incremental_key_value:
+ stream_archive_load_files_values['min'] = incremental_key_value
+
+ if max_value is None or max_value < incremental_key_value:
+ stream_archive_load_files_values['max'] = incremental_key_value
+
flush = False
if row_count[stream] >= batch_size_rows:
flush = True
@@ -203,6 +221,7 @@ def persist_lines(config, lines, table_cache=None, file_format_type: FileFormatT
config,
state,
flushed_state,
+ archive_load_files_data,
filter_streams=filter_streams)
flush_timestamp = datetime.utcnow()
@@ -231,12 +250,19 @@ def persist_lines(config, lines, table_cache=None, file_format_type: FileFormatT
# if same stream has been encountered again, it means the schema might have been altered
# so previous records need to be flushed
if row_count.get(stream, 0) > 0:
+ # flush all streams, delete records if needed, reset counts and then emit current state
+ if config.get('flush_all_streams'):
+ filter_streams = None
+ else:
+ filter_streams = [stream]
flushed_state = flush_streams(records_to_load,
row_count,
stream_to_sync,
config,
state,
- flushed_state)
+ flushed_state,
+ archive_load_files_data,
+ filter_streams=filter_streams)
# emit latest encountered state
emit_state(flushed_state)
@@ -267,6 +293,27 @@ def persist_lines(config, lines, table_cache=None, file_format_type: FileFormatT
else:
stream_to_sync[stream] = DbSync(config, o, table_cache, file_format_type)
+ if archive_load_files:
+ archive_load_files_data[stream] = {
+ 'tap': config.get('tap_id'),
+ }
+
+ # In case of incremental replication, track min/max of the replication key.
+ # Incremental replication is assumed if o['bookmark_properties'][0] is one of the columns.
+ incremental_key_column_name = stream_utils.get_incremental_key(o)
+ if incremental_key_column_name:
+ LOGGER.info("Using %s as incremental_key_column_name", incremental_key_column_name)
+ archive_load_files_data[stream].update(
+ column=incremental_key_column_name,
+ min=None,
+ max=None
+ )
+ else:
+ LOGGER.warning(
+ "archive_load_files is enabled, but no incremental_key_column_name was found. "
+ "Min/max values will not be added to metadata for stream %s.", stream
+ )
+
stream_to_sync[stream].create_schema_if_not_exists()
stream_to_sync[stream].sync_table()
@@ -280,8 +327,8 @@ def persist_lines(config, lines, table_cache=None, file_format_type: FileFormatT
LOGGER.debug('Setting state to %s', o['value'])
state = o['value']
- # Initially set flushed state
- if not flushed_state:
+ # Set flushed state if it's not defined or there are no records so far
+ if not flushed_state or sum(row_count.values()) == 0:
flushed_state = copy.deepcopy(state)
else:
@@ -291,7 +338,8 @@ def persist_lines(config, lines, table_cache=None, file_format_type: FileFormatT
# then flush all buckets.
if sum(row_count.values()) > 0:
# flush all streams one last time, delete records if needed, reset counts and then emit current state
- flushed_state = flush_streams(records_to_load, row_count, stream_to_sync, config, state, flushed_state)
+ flushed_state = flush_streams(records_to_load, row_count, stream_to_sync, config, state, flushed_state,
+ archive_load_files_data)
# emit latest state
emit_state(copy.deepcopy(flushed_state))
@@ -305,6 +353,7 @@ def flush_streams(
config,
state,
flushed_state,
+ archive_load_files_data,
filter_streams=None):
"""
Flushes all buckets and resets records count to 0 as well as empties records to load list
@@ -315,6 +364,7 @@ def flush_streams(
:param state: dictionary containing the original state from tap
:param flushed_state: dictionary containing updated states only when streams got flushed
:param filter_streams: Keys of streams to flush from the streams dict. Default is every stream
+ :param archive_load_files_data: dictionary of dictionaries containing archive load files data
:return: State dict with flushed positions
"""
parallelism = config.get("parallelism", DEFAULT_PARALLELISM)
@@ -347,7 +397,8 @@ def flush_streams(
db_sync=stream_to_sync[stream],
no_compression=config.get('no_compression'),
delete_rows=config.get('hard_delete'),
- temp_dir=config.get('temp_dir')
+ temp_dir=config.get('temp_dir'),
+ archive_load_files=copy.copy(archive_load_files_data.get(stream, None))
) for stream in streams_to_flush)
# reset flushed stream records to empty to avoid flushing same records
@@ -368,16 +419,20 @@ def flush_streams(
else:
flushed_state = copy.deepcopy(state)
+ if stream in archive_load_files_data:
+ archive_load_files_data[stream]['min'] = None
+ archive_load_files_data[stream]['max'] = None
+
# Return with state message with flushed positions
return flushed_state
def load_stream_batch(stream, records, row_count, db_sync, no_compression=False, delete_rows=False,
- temp_dir=None):
+ temp_dir=None, archive_load_files=None):
"""Load one batch of the stream into target table"""
# Load into snowflake
if row_count[stream] > 0:
- flush_records(stream, records, db_sync, temp_dir, no_compression)
+ flush_records(stream, records, db_sync, temp_dir, no_compression, archive_load_files)
# Delete soft-deleted, flagged rows - where _sdc_deleted at is not null
if delete_rows:
@@ -391,7 +446,8 @@ def flush_records(stream: str,
records: List[Dict],
db_sync: DbSync,
temp_dir: str = None,
- no_compression: bool = False) -> None:
+ no_compression: bool = False,
+ archive_load_files: Dict = None) -> None:
"""
Takes a list of record messages and loads it into the snowflake target table
@@ -401,8 +457,9 @@ def flush_records(stream: str,
column value
row_count:
db_sync: A DbSync object
- temp_dir: Directory where intermediate temporary files will be created. (Default: OS specificy temp directory)
+ temp_dir: Directory where intermediate temporary files will be created. (Default: OS specific temp directory)
no_compression: Disable to use compressed files. (Default: False)
+ archive_load_files: Data needed for archive load files. (Default: None)
Returns:
None
@@ -413,7 +470,7 @@ def flush_records(stream: str,
compression=not no_compression,
dest_dir=temp_dir,
data_flattening_max_level=
- db_sync.data_flattening_max_level)
+ db_sync.data_flattening_max_level)
# Get file stats
row_count = len(records)
@@ -423,8 +480,39 @@ def flush_records(stream: str,
s3_key = db_sync.put_to_stage(filepath, stream, row_count, temp_dir=temp_dir)
db_sync.load_file(s3_key, row_count, size_bytes)
- # Delete file from local disk and from s3
+ # Delete file from local disk
os.remove(filepath)
+
+ if archive_load_files:
+ stream_name_parts = stream_utils.stream_name_to_dict(stream)
+ if 'schema_name' not in stream_name_parts or 'table_name' not in stream_name_parts:
+ raise Exception(f"Failed to extract schema and table names from stream '{stream}'")
+
+ archive_schema = stream_name_parts['schema_name']
+ archive_table = stream_name_parts['table_name']
+ archive_tap = archive_load_files['tap']
+
+ archive_metadata = {
+ 'tap': archive_tap,
+ 'schema': archive_schema,
+ 'table': archive_table,
+ 'archived-by': 'pipelinewise_target_snowflake'
+ }
+
+ if 'column' in archive_load_files:
+ archive_metadata.update({
+ 'incremental-key': archive_load_files['column'],
+ 'incremental-key-min': str(archive_load_files['min']),
+ 'incremental-key-max': str(archive_load_files['max'])
+ })
+
+ # Use same file name as in import
+ archive_file = os.path.basename(s3_key)
+ archive_key = f"{archive_tap}/{archive_table}/{archive_file}"
+
+ db_sync.copy_to_archive(s3_key, archive_key, archive_metadata)
+
+ # Delete file from S3
db_sync.delete_from_stage(stream, s3_key)
@@ -435,7 +523,7 @@ def main():
args = arg_parser.parse_args()
if args.config:
- with open(args.config) as config_input:
+ with open(args.config, encoding="utf8") as config_input:
config = json.load(config_input)
else:
config = {}
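To tie the `archive_load_files` pieces above together, here is a hypothetical example of the S3 key and metadata an archived load file ends up with. The values are invented; the keys mirror what `flush_records()` builds before calling `db_sync.copy_to_archive()`.

```python
import os

# s3_key is whatever put_to_stage() produced for the load file (placeholder below);
# the archived copy keeps the same file name.
s3_key = "snowflake-imports/pipelinewise_tap_mysql_test-test_simple_table_20230808-120000-000000_records.csv.gz"
archive_key = f"mysql_fx/test_simple_table/{os.path.basename(s3_key)}"
# copy_to_archive() then prefixes this with archive_load_files_s3_prefix (default: "archive").

archive_metadata = {
    "tap": "mysql_fx",                    # config["tap_id"]
    "schema": "tap_mysql_test",           # parsed from the stream name
    "table": "test_simple_table",
    "archived-by": "pipelinewise_target_snowflake",
    # present only when the SCHEMA message carried a usable bookmark column:
    "incremental-key": "id",
    "incremental-key-min": "1",
    "incremental-key-max": "3",
}
```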
diff --git a/target_snowflake/db_sync.py b/target_snowflake/db_sync.py
index 4689785d..ffac46c5 100644
--- a/target_snowflake/db_sync.py
+++ b/target_snowflake/db_sync.py
@@ -1,17 +1,16 @@
import json
import sys
-from typing import List, Dict, Union
-
import snowflake.connector
import re
import time
+from typing import List, Dict, Union, Tuple, Set
from singer import get_logger
-import target_snowflake.flattening as flattening
-import target_snowflake.stream_utils as stream_utils
+from target_snowflake import flattening
+from target_snowflake import stream_utils
from target_snowflake.file_format import FileFormat, FileFormatTypes
-from target_snowflake.exceptions import TooManyRecordsException
+from target_snowflake.exceptions import TooManyRecordsException, PrimaryKeyNotFoundException
from target_snowflake.upload_clients.s3_upload_client import S3UploadClient
from target_snowflake.upload_clients.snowflake_upload_client import SnowflakeUploadClient
@@ -55,7 +54,7 @@ def validate_config(config):
# Check if mandatory keys exist
for k in required_config_keys:
if not config.get(k, None):
- errors.append("Required key is missing from config: [{}]".format(k))
+ errors.append(f"Required key is missing from config: [{k}]")
# Check target schema config
config_default_target_schema = config.get('default_target_schema', None)
@@ -63,6 +62,11 @@ def validate_config(config):
if not config_default_target_schema and not config_schema_mapping:
errors.append("Neither 'default_target_schema' (string) nor 'schema_mapping' (object) keys set in config.")
+ # Check if archive load files option is using external stages
+ archive_load_files = config.get('archive_load_files', False)
+ if archive_load_files and not config.get('s3_bucket', None):
+ errors.append('Archive load files option can be used only with external s3 stages. Please define s3_bucket.')
+
return errors
@@ -77,6 +81,8 @@ def column_type(schema_property):
# Every date-time JSON value is currently mapped to TIMESTAMP_NTZ
elif property_format == 'date-time':
col_type = 'timestamp_ntz'
+ elif property_format == 'date':
+ col_type = 'date'
elif property_format == 'time':
col_type = 'time'
elif property_format == 'binary':
@@ -107,15 +113,17 @@ def column_trans(schema_property):
def safe_column_name(name):
"""Generate SQL friendly column name"""
- return '"{}"'.format(name).upper()
+ return f'"{name}"'.upper()
+
def json_element_name(name):
"""Generate SQL friendly semi structured element reference name"""
- return '"{}"'.format(name)
+ return f'"{name}"'
+
def column_clause(name, schema_property):
"""Generate DDL column name with column type string"""
- return '{} {}'.format(safe_column_name(name), column_type(schema_property))
+ return f'{safe_column_name(name)} {column_type(schema_property)}'
def primary_column_names(stream_schema_message):
@@ -208,7 +216,7 @@ def __init__(self, connection_config, stream_schema_message=None, table_cache=No
self.file_format = FileFormat(self.connection_config['file_format'], self.query, file_format_type)
if not self.connection_config.get('stage') and self.file_format.file_format_type == FileFormatTypes.PARQUET:
- self.logger.error("Table stages with Parquet file format is not suppported. "
+ self.logger.error("Table stages with Parquet file format is not supported. "
"Use named stages with Parquet file format or table stages with CSV files format")
sys.exit(1)
@@ -243,7 +251,7 @@ def __init__(self, connection_config, stream_schema_message=None, table_cache=No
raise Exception(
"Target schema name not defined in config. "
"Neither 'default_target_schema' (string) nor 'schema_mapping' (object) defines "
- "target schema for {} stream.".format(stream_name))
+ f"target schema for {stream_name} stream.")
# Define grantees
# ---------------
@@ -275,17 +283,6 @@ def __init__(self, connection_config, stream_schema_message=None, table_cache=No
self.upload_client = S3UploadClient(connection_config)
# Use table stage
else:
- # Enforce no parallelism with table stages.
- # The PUT command in the snowflake-python-connector is using boto3.create_client() function which is not
- # thread safe. More info at https://github.com/boto/boto3/issues/801
- if connection_config.get('parallelism') != 1:
- self.logger.warning('Enforcing to use single thread parallelism with table stages. '
- 'The PUT command in the snowflake-python-connector is using boto3 create_client() '
- 'function which is not thread safe. '
- 'If you need parallel file upload please use external stage by adding s3_bucket '
- 'key to the configuration')
- connection_config['parallelism'] = 1
-
self.upload_client = SnowflakeUploadClient(connection_config, self)
def open_connection(self):
@@ -328,7 +325,7 @@ def query(self, query: Union[str, List[str]], params: Dict = None, max_records=0
# Run every query in one transaction if query is a list of SQL
if isinstance(query, list):
- self.logger.info('Starting Transaction')
+ self.logger.debug('Starting Transaction')
cur.execute("START TRANSACTION")
queries = query
else:
@@ -342,7 +339,7 @@ def query(self, query: Union[str, List[str]], params: Dict = None, max_records=0
# update the LAST_QID
params['LAST_QID'] = qid
- self.logger.info("Running query: '%s' with Params %s", q, params)
+ self.logger.debug("Running query: '%s' with Params %s", q, params)
cur.execute(q, params)
qid = cur.sfqid
@@ -366,7 +363,7 @@ def table_name(self, stream_name, is_temporary, without_schema=False):
sf_table_name = table_name.replace('.', '_').replace('-', '_').lower()
if is_temporary:
- sf_table_name = '{}_temp'.format(sf_table_name)
+ sf_table_name = f'{sf_table_name}_temp'
if without_schema:
return f'"{sf_table_name.upper()}"'
@@ -378,13 +375,17 @@ def record_primary_key_string(self, record):
if len(self.stream_schema_message['key_properties']) == 0:
return None
flatten = flattening.flatten_record(record, self.flatten_schema, max_level=self.data_flattening_max_level)
- try:
- key_props = [str(flatten[p]) for p in self.stream_schema_message['key_properties']]
- except Exception as exc:
- self.logger.error(
- 'Cannot find %s primary key(s) in record: %s', self.stream_schema_message['key_properties'],
- flatten)
- raise exc
+
+ key_props = []
+ for key_prop in self.stream_schema_message['key_properties']:
+ if key_prop not in flatten or flatten[key_prop] is None:
+ raise PrimaryKeyNotFoundException(
+ f"Primary key '{key_prop}' does not exist in record or is null. "
+ f"Available fields: {list(flatten.keys())}"
+ )
+
+ key_props.append(str(flatten[key_prop]))
+
return ','.join(key_props)
def put_to_stage(self, file, stream, count, temp_dir=None):
@@ -394,9 +395,38 @@ def put_to_stage(self, file, stream, count, temp_dir=None):
def delete_from_stage(self, stream, s3_key):
"""Delete file from snowflake stage"""
- self.logger.info('Deleting %s from stage', format(s3_key))
self.upload_client.delete_object(stream, s3_key)
+ def copy_to_archive(self, s3_source_key, s3_archive_key, s3_archive_metadata):
+ """
+ Copy file from snowflake stage to archive.
+
+ s3_source_key: The s3 key to copy, assumed to exist in the bucket configured as 's3_bucket'
+
+ s3_archive_key: The key to use in archive destination. This will be prefixed with the config value
+ 'archive_load_files_s3_prefix'. If none is specified, 'archive' will be used as the prefix.
+
+ As destination bucket, the config value 'archive_load_files_s3_bucket' will be used. If none is
+ specified, the bucket configured as 's3_bucket' will be used.
+
+ s3_archive_metadata: This dict will be merged with any metadata in the source file.
+
+ """
+ source_bucket = self.connection_config.get('s3_bucket')
+
+ # Get archive s3_bucket from config, or use same bucket if not specified
+ archive_bucket = self.connection_config.get('archive_load_files_s3_bucket', source_bucket)
+
+ # Determine prefix to use in archive s3 bucket
+ default_archive_prefix = 'archive'
+ archive_prefix = self.connection_config.get('archive_load_files_s3_prefix', default_archive_prefix)
+ prefixed_archive_key = f'{archive_prefix}/{s3_archive_key}'
+
+ copy_source = f'{source_bucket}/{s3_source_key}'
+
+ self.logger.info('Copying %s to archive location %s', copy_source, prefixed_archive_key)
+ self.upload_client.copy_object(copy_source, archive_bucket, prefixed_archive_key, s3_archive_metadata)
+
def get_stage_name(self, stream):
"""Generate snowflake stage name"""
stage = self.connection_config.get('stage', None)
@@ -408,8 +438,7 @@ def get_stage_name(self, stream):
def load_file(self, s3_key, count, size_bytes):
"""Load a supported file type from snowflake stage into target table"""
- stream_schema_message = self.stream_schema_message
- stream = stream_schema_message['stream']
+ stream = self.stream_schema_message['stream']
self.logger.info("Loading %d rows into '%s'", count, self.table_name(stream, False))
# Get list of columns with types
@@ -422,55 +451,96 @@ def load_file(self, s3_key, count, size_bytes):
for (name, schema) in self.flatten_schema.items()
]
+ inserts = 0
+ updates = 0
+
+ # Insert or Update with MERGE command if primary key defined
+ if len(self.stream_schema_message['key_properties']) > 0:
+ try:
+ inserts, updates = self._load_file_merge(
+ s3_key=s3_key,
+ stream=stream,
+ columns_with_trans=columns_with_trans
+ )
+ except Exception as ex:
+ self.logger.error(
+ 'Error while executing MERGE query for table "%s" in stream "%s"',
+ self.table_name(stream, False), stream
+ )
+ raise ex
+
+ # Insert only with COPY command if no primary key
+ else:
+ try:
+ inserts, updates = (
+ self._load_file_copy(
+ s3_key=s3_key,
+ stream=stream,
+ columns_with_trans=columns_with_trans
+ ),
+ 0,
+ )
+ except Exception as ex:
+ self.logger.error(
+ 'Error while executing COPY query for table "%s" in stream "%s"',
+ self.table_name(stream, False), stream
+ )
+ raise ex
+
+ self.logger.info(
+ 'Loading into %s: %s',
+ self.table_name(stream, False),
+ json.dumps({'inserts': inserts, 'updates': updates, 'size_bytes': size_bytes})
+ )
+
+ def _load_file_merge(self, s3_key, stream, columns_with_trans) -> Tuple[int, int]:
+ # MERGE does insert and update
+ inserts = 0
+ updates = 0
with self.open_connection() as connection:
with connection.cursor(snowflake.connector.DictCursor) as cur:
- inserts = 0
- updates = 0
-
- # Insert or Update with MERGE command if primary key defined
- if len(self.stream_schema_message['key_properties']) > 0:
- merge_sql = self.file_format.formatter.create_merge_sql(table_name=self.table_name(stream, False),
- stage_name=self.get_stage_name(stream),
- s3_key=s3_key,
- file_format_name=
- self.connection_config['file_format'],
- columns=columns_with_trans,
- pk_merge_condition=
- self.primary_key_merge_condition())
- self.logger.debug('Running query: %s', merge_sql)
- cur.execute(merge_sql)
-
- # Get number of inserted and updated records - MERGE does insert and update
- results = cur.fetchall()
- if len(results) > 0:
- inserts = results[0].get('number of rows inserted', 0)
- updates = results[0].get('number of rows updated', 0)
-
- # Insert only with COPY command if no primary key
- else:
- copy_sql = self.file_format.formatter.create_copy_sql(table_name=self.table_name(stream, False),
- stage_name=self.get_stage_name(stream),
- s3_key=s3_key,
- file_format_name=
- self.connection_config['file_format'],
- columns=columns_with_trans)
- self.logger.debug('Running query: %s', copy_sql)
- cur.execute(copy_sql)
-
- # Get number of inserted records - COPY does insert only
- results = cur.fetchall()
- if len(results) > 0:
- inserts = results[0].get('rows_loaded', 0)
-
- self.logger.info('Loading into %s: %s',
- self.table_name(stream, False),
- json.dumps({'inserts': inserts, 'updates': updates, 'size_bytes': size_bytes}))
+ merge_sql = self.file_format.formatter.create_merge_sql(
+ table_name=self.table_name(stream, False),
+ stage_name=self.get_stage_name(stream),
+ s3_key=s3_key,
+ file_format_name=self.connection_config['file_format'],
+ columns=columns_with_trans,
+ pk_merge_condition=self.primary_key_merge_condition()
+ )
+ self.logger.debug('Running query: %s', merge_sql)
+ cur.execute(merge_sql)
+ # Get number of inserted and updated records
+ results = cur.fetchall()
+ if len(results) > 0:
+ inserts = results[0].get('number of rows inserted', 0)
+ updates = results[0].get('number of rows updated', 0)
+ return inserts, updates
+
+ def _load_file_copy(self, s3_key, stream, columns_with_trans) -> int:
+ # COPY does insert only
+ inserts = 0
+ with self.open_connection() as connection:
+ with connection.cursor(snowflake.connector.DictCursor) as cur:
+ copy_sql = self.file_format.formatter.create_copy_sql(
+ table_name=self.table_name(stream, False),
+ stage_name=self.get_stage_name(stream),
+ s3_key=s3_key,
+ file_format_name=self.connection_config['file_format'],
+ columns=columns_with_trans
+ )
+ self.logger.debug('Running query: %s', copy_sql)
+ cur.execute(copy_sql)
+ # Get number of inserted records - COPY does insert only
+ results = cur.fetchall()
+ if len(results) > 0:
+ inserts = results[0].get('rows_loaded', 0)
+ return inserts
def primary_key_merge_condition(self):
"""Generate SQL join condition on primary keys for merge SQL statements"""
stream_schema_message = self.stream_schema_message
names = primary_column_names(stream_schema_message)
- return ' AND '.join(['s.{0} = t.{0}'.format(c) for c in names])
+ return ' AND '.join([f's.{c} = t.{c}' for c in names])
def column_names(self):
"""Get list of columns in the schema"""
@@ -487,26 +557,27 @@ def create_table_query(self, is_temporary=False):
for (name, schema) in self.flatten_schema.items()
]
- primary_key = ["PRIMARY KEY ({})".format(', '.join(primary_column_names(stream_schema_message)))] \
- if len(stream_schema_message.get('key_properties', [])) > 0 else []
+ primary_key = []
+ if len(stream_schema_message.get('key_properties', [])) > 0:
+ pk_list = ', '.join(primary_column_names(stream_schema_message))
+ primary_key = [f"PRIMARY KEY({pk_list})"]
- return 'CREATE {}TABLE IF NOT EXISTS {} ({}) {}'.format(
- 'TEMP ' if is_temporary else '',
- self.table_name(stream_schema_message['stream'], is_temporary),
- ', '.join(columns + primary_key),
- 'data_retention_time_in_days = 0 ' if is_temporary else 'data_retention_time_in_days = 1 '
- )
+ p_temp = 'TEMP ' if is_temporary else ''
+ p_table_name = self.table_name(stream_schema_message['stream'], is_temporary)
+ p_columns = ', '.join(columns + primary_key)
+ p_extra = 'data_retention_time_in_days = 0 ' if is_temporary else 'data_retention_time_in_days = 1 '
+ return f'CREATE {p_temp}TABLE IF NOT EXISTS {p_table_name} ({p_columns}) {p_extra}'
def grant_usage_on_schema(self, schema_name, grantee):
"""Grant usage on schema"""
- query = "GRANT USAGE ON SCHEMA {} TO ROLE {}".format(schema_name, grantee)
+ query = f"GRANT USAGE ON SCHEMA {schema_name} TO ROLE {grantee}"
self.logger.info("Granting USAGE privilege on '%s' schema to '%s'... %s", schema_name, grantee, query)
self.query(query)
# pylint: disable=invalid-name
def grant_select_on_all_tables_in_schema(self, schema_name, grantee):
"""Grant select on all tables in schema"""
- query = "GRANT SELECT ON ALL TABLES IN SCHEMA {} TO ROLE {}".format(schema_name, grantee)
+ query = f"GRANT SELECT ON ALL TABLES IN SCHEMA {schema_name} TO ROLE {grantee}"
self.logger.info(
"Granting SELECT ON ALL TABLES privilege on '%s' schema to '%s'... %s", schema_name, grantee, query)
self.query(query)
@@ -523,7 +594,7 @@ def grant_privilege(cls, schema, grantees, grant_method):
def delete_rows(self, stream):
"""Hard delete rows from target table"""
table = self.table_name(stream, False)
- query = "DELETE FROM {} WHERE _sdc_deleted_at IS NOT NULL".format(table)
+ query = f"DELETE FROM {table} WHERE _sdc_deleted_at IS NOT NULL"
self.logger.info("Deleting rows from '%s' table... %s", table, query)
self.logger.info('DELETE %d', len(self.query(query)))
@@ -540,7 +611,7 @@ def create_schema_if_not_exists(self):
schema_rows = self.query(f"SHOW SCHEMAS LIKE '{schema_name.upper()}'")
if len(schema_rows) == 0:
- query = "CREATE SCHEMA IF NOT EXISTS {}".format(schema_name)
+ query = f"CREATE SCHEMA IF NOT EXISTS {schema_name}"
self.logger.info("Schema '%s' does not exist. Creating... %s", schema_name, query)
self.query(query)
@@ -570,7 +641,7 @@ def get_tables(self, table_schemas=None):
# Run everything in one transaction
try:
- tables = self.query(queries, max_records=9999)
+ tables = self.query(queries, max_records=99999)
# Catch exception when schema not exists and SHOW TABLES throws a ProgrammingError
# Regexp to extract snowflake error code and message from the exception message
@@ -617,7 +688,7 @@ def get_table_columns(self, table_schemas=None):
# Run everything in one transaction
try:
- columns = self.query(queries, max_records=9999)
+ columns = self.query(queries, max_records=99999)
if not columns:
self.logger.warning('No columns discovered in the schema "%s"',
@@ -702,22 +773,23 @@ def update_columns(self):
def drop_column(self, column_name, stream):
"""Drops column from an existing table"""
- drop_column = "ALTER TABLE {} DROP COLUMN {}".format(self.table_name(stream, False), column_name)
+ drop_column = f"ALTER TABLE {self.table_name(stream, False)} DROP COLUMN {column_name}"
self.logger.info('Dropping column: %s', drop_column)
self.query(drop_column)
def version_column(self, column_name, stream):
"""Versions a column in an existing table"""
- version_column = "ALTER TABLE {} RENAME COLUMN {} TO \"{}_{}\"".format(self.table_name(stream, False),
- column_name,
- column_name.replace("\"", ""),
- time.strftime("%Y%m%d_%H%M"))
+ p_table_name = self.table_name(stream, False)
+ p_column_name = column_name.replace("\"", "")
+ p_ver_time = time.strftime("%Y%m%d_%H%M")
+
+ version_column = f"ALTER TABLE {p_table_name} RENAME COLUMN {column_name} TO \"{p_column_name}_{p_ver_time}\""
self.logger.info('Versioning column: %s', version_column)
self.query(version_column)
def add_column(self, column, stream):
"""Adds a new column to an existing table"""
- add_column = "ALTER TABLE {} ADD COLUMN {}".format(self.table_name(stream, False), column)
+ add_column = f"ALTER TABLE {self.table_name(stream, False)} ADD COLUMN {column}"
self.logger.info('Adding column: %s', add_column)
self.query(add_column)
@@ -740,7 +812,6 @@ def sync_table(self):
query = self.create_table_query()
self.logger.info('Table %s does not exist. Creating...', table_name_with_schema)
self.query(query)
-
self.grant_privilege(self.schema_name, self.grantees, self.grant_select_on_all_tables_in_schema)
# Refresh columns cache if required
@@ -749,3 +820,64 @@ def sync_table(self):
else:
self.logger.info('Table %s exists', table_name_with_schema)
self.update_columns()
+
+ self._refresh_table_pks()
+
+ def _refresh_table_pks(self):
+ """
+ Refresh table PK constraints by either dropping or adding PK based on changes to `key_properties` of the
+ stream schema.
+ The NOT NULL constraint on the PK columns is also dropped.
+ """
+ table_name = self.table_name(self.stream_schema_message['stream'], False)
+ current_pks = self._get_current_pks()
+ new_pks = set(pk.upper() for pk in self.stream_schema_message.get('key_properties', []))
+
+ queries = []
+
+ self.logger.debug('Table: %s, Current PKs: %s | New PKs: %s ',
+ self.stream_schema_message['stream'],
+ current_pks,
+ new_pks
+ )
+
+ if not new_pks and current_pks:
+ self.logger.info('Table "%s" currently has PK constraint, but we need to drop it.', table_name)
+ queries.append(f'alter table {table_name} drop primary key;')
+
+ elif new_pks != current_pks:
+ self.logger.info('Changes detected in pk columns of table "%s", need to refresh PK.', table_name)
+ pk_list = ', '.join([safe_column_name(col) for col in new_pks])
+
+ if current_pks:
+ queries.append(f'alter table {table_name} drop primary key;')
+
+ queries.append(f'alter table {table_name} add primary key({pk_list});')
+
+ # For now, we don't wish to enforce non-nullability on the pk columns
+ for pk in current_pks.union(new_pks):
+ queries.append(f'alter table {table_name} alter column {safe_column_name(pk)} drop not null;')
+
+ self.query(queries)
+
+ def _get_current_pks(self) -> Set[str]:
+ """
+ Finds the stream's current PK columns in Snowflake.
+ Returns: Set of PK column names, in upper case. An empty set means the table has no PK.
+ """
+ table_name = self.table_name(self.stream_schema_message['stream'], False)
+
+ show_query = f"show primary keys in table {self.connection_config['dbname']}.{table_name};"
+
+ columns = set()
+ try:
+ columns = self.query(show_query)
+
+ # Catch the ProgrammingError raised by SHOW PRIMARY KEYS when the table does not exist
+ # Regexp to extract snowflake error code and message from the exception message
+ # Do nothing if the table does not exist
+ except snowflake.connector.errors.ProgrammingError as exc:
+ if not re.match(r'002043 \(02000\):.*\n.*does not exist.*', str(sys.exc_info()[1])):
+ raise exc
+
+ return set(col['column_name'] for col in columns)
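To make the new PK-refresh behaviour concrete, this is roughly the statement sequence `_refresh_table_pks()` would queue when a stream's `key_properties` change from `['id']` to `['id', 'name']`. The schema and table names are placeholders; the real ones come from `table_name()`, and column quoting follows `safe_column_name()`.

```python
# Hypothetical output of _refresh_table_pks() for key_properties ['id'] -> ['id', 'name'].
current_pks = {"ID"}            # from `show primary keys in table ...`
new_pks = {"ID", "NAME"}        # upper-cased key_properties from the SCHEMA message

queries = [
    # an existing PK is dropped before the new one is added
    'alter table my_schema."MY_TABLE" drop primary key;',
    'alter table my_schema."MY_TABLE" add primary key("ID", "NAME");',
    # NOT NULL is dropped for every old and new PK column
    'alter table my_schema."MY_TABLE" alter column "ID" drop not null;',
    'alter table my_schema."MY_TABLE" alter column "NAME" drop not null;',
]
```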
diff --git a/target_snowflake/exceptions.py b/target_snowflake/exceptions.py
index d8058b8b..32a39396 100644
--- a/target_snowflake/exceptions.py
+++ b/target_snowflake/exceptions.py
@@ -23,3 +23,11 @@ class FileFormatNotFoundException(Exception):
class InvalidFileFormatException(Exception):
"""Exception to raise when name file format is not compatible"""
+
+
+class UnexpectedMessageTypeException(Exception):
+ """Exception to raise when provided message doesn't match the expected type"""
+
+
+class PrimaryKeyNotFoundException(Exception):
+ """Exception to raise when primary key not found in the record message"""
diff --git a/target_snowflake/file_formats/csv.py b/target_snowflake/file_formats/csv.py
index 189905e1..42d76ee4 100644
--- a/target_snowflake/file_formats/csv.py
+++ b/target_snowflake/file_formats/csv.py
@@ -6,7 +6,7 @@
from typing import Callable, Dict, List
from tempfile import mkstemp
-import target_snowflake.flattening as flattening
+from target_snowflake import flattening
def create_copy_sql(table_name: str,
@@ -15,14 +15,11 @@ def create_copy_sql(table_name: str,
file_format_name: str,
columns: List):
"""Generate a CSV compatible snowflake COPY INTO command"""
- return "COPY INTO {} ({}) " \
- "FROM '@{}/{}' " \
- "FILE_FORMAT = (format_name='{}')".format(
- table_name,
- ', '.join([c['name'] for c in columns]),
- stage_name,
- s3_key,
- file_format_name)
+ p_columns = ', '.join([c['name'] for c in columns])
+
+ return f"COPY INTO {table_name} ({p_columns}) " \
+ f"FROM '@{stage_name}/{s3_key}' " \
+ f"FILE_FORMAT = (format_name='{file_format_name}')"
def create_merge_sql(table_name: str,
@@ -32,24 +29,20 @@ def create_merge_sql(table_name: str,
columns: List,
pk_merge_condition: str) -> str:
"""Generate a CSV compatible snowflake MERGE INTO command"""
- return "MERGE INTO {} t USING (" \
- "SELECT {} " \
- "FROM '@{}/{}' " \
- "(FILE_FORMAT => '{}')) s " \
- "ON {} " \
- "WHEN MATCHED THEN UPDATE SET {} " \
+ p_source_columns = ', '.join([f"{c['trans']}(${i + 1}) {c['name']}" for i, c in enumerate(columns)])
+ p_update = ', '.join([f"{c['name']}=s.{c['name']}" for c in columns])
+ p_insert_cols = ', '.join([c['name'] for c in columns])
+ p_insert_values = ', '.join([f"s.{c['name']}" for c in columns])
+
+ return f"MERGE INTO {table_name} t USING (" \
+ f"SELECT {p_source_columns} " \
+ f"FROM '@{stage_name}/{s3_key}' " \
+ f"(FILE_FORMAT => '{file_format_name}')) s " \
+ f"ON {pk_merge_condition} " \
+ f"WHEN MATCHED THEN UPDATE SET {p_update} " \
"WHEN NOT MATCHED THEN " \
- "INSERT ({}) " \
- "VALUES ({})".format(
- table_name,
- ', '.join(["{}(${}) {}".format(c['trans'], i + 1, c['name']) for i, c in enumerate(columns)]),
- stage_name,
- s3_key,
- file_format_name,
- pk_merge_condition,
- ', '.join(['{0}=s.{0}'.format(c['name']) for c in columns]),
- ', '.join([c['name'] for c in columns]),
- ', '.join(['s.{}'.format(c['name']) for c in columns]))
+ f"INSERT ({p_insert_cols}) " \
+ f"VALUES ({p_insert_values})"
def record_to_csv_line(record: dict,
diff --git a/target_snowflake/file_formats/parquet.py b/target_snowflake/file_formats/parquet.py
index b16fa462..ad02e6a5 100644
--- a/target_snowflake/file_formats/parquet.py
+++ b/target_snowflake/file_formats/parquet.py
@@ -5,7 +5,7 @@
from typing import Dict, List
from tempfile import mkstemp
-import target_snowflake.flattening as flattening
+from target_snowflake import flattening
def create_copy_sql(table_name: str,
@@ -14,18 +14,13 @@ def create_copy_sql(table_name: str,
file_format_name: str,
columns: List):
"""Generate a Parquet compatible snowflake COPY INTO command"""
- return "COPY INTO {} ({}) " \
- "FROM (SELECT {} FROM '@{}/{}') " \
- "FILE_FORMAT = (format_name='{}')".format(
- table_name,
- ', '.join([c['name'] for c in columns]),
- ', '.join(["{}($1:{}) {}".format(c['trans'],
- c['json_element_name'],
- c['name'])
- for i, c in enumerate(columns)]),
- stage_name,
- s3_key,
- file_format_name)
+ p_target_columns = ', '.join([c['name'] for c in columns])
+ p_source_columns = ', '.join([f"{c['trans']}($1:{c['json_element_name']}) {c['name']}"
+ for i, c in enumerate(columns)])
+
+ return f"COPY INTO {table_name} ({p_target_columns}) " \
+ f"FROM (SELECT {p_source_columns} FROM '@{stage_name}/{s3_key}') " \
+ f"FILE_FORMAT = (format_name='{file_format_name}')"
def create_merge_sql(table_name: str,
@@ -35,27 +30,21 @@ def create_merge_sql(table_name: str,
columns: List,
pk_merge_condition: str) -> str:
"""Generate a Parquet compatible snowflake MERGE INTO command"""
- return "MERGE INTO {} t USING (" \
- "SELECT {} " \
- "FROM '@{}/{}' " \
- "(FILE_FORMAT => '{}')) s " \
- "ON {} " \
- "WHEN MATCHED THEN UPDATE SET {} " \
+ p_source_columns = ', '.join([f"{c['trans']}($1:{c['json_element_name']}) {c['name']}"
+ for i, c in enumerate(columns)])
+ p_update = ', '.join([f"{c['name']}=s.{c['name']}" for c in columns])
+ p_insert_cols = ', '.join([c['name'] for c in columns])
+ p_insert_values = ', '.join([f"s.{c['name']}" for c in columns])
+
+ return f"MERGE INTO {table_name} t USING (" \
+ f"SELECT {p_source_columns} " \
+ f"FROM '@{stage_name}/{s3_key}' " \
+ f"(FILE_FORMAT => '{file_format_name}')) s " \
+ f"ON {pk_merge_condition} " \
+ f"WHEN MATCHED THEN UPDATE SET {p_update} " \
"WHEN NOT MATCHED THEN " \
- "INSERT ({}) " \
- "VALUES ({})".format(
- table_name,
- ', '.join(["{}($1:{}) {}".format(c['trans'],
- c['json_element_name'],
- c['name'])
- for i, c in enumerate(columns)]),
- stage_name,
- s3_key,
- file_format_name,
- pk_merge_condition,
- ', '.join(['{0}=s.{0}'.format(c['name']) for c in columns]),
- ', '.join([c['name'] for c in columns]),
- ', '.join(['s.{}'.format(c['name']) for c in columns]))
+ f"INSERT ({p_insert_cols}) " \
+ f"VALUES ({p_insert_values})"
def records_to_dataframe(records: Dict,
diff --git a/target_snowflake/flattening.py b/target_snowflake/flattening.py
index 33cc1cde..a6536ecc 100644
--- a/target_snowflake/flattening.py
+++ b/target_snowflake/flattening.py
@@ -52,23 +52,17 @@ def flatten_schema(d, parent_key=None, sep='__', level=0, max_level=0):
items.extend(flatten_schema(v, parent_key + [k], sep=sep, level=level + 1, max_level=max_level).items())
else:
items.append((new_key, v))
- else:
- if len(v.values()) > 0:
- if list(v.values())[0][0]['type'] == 'string':
- list(v.values())[0][0]['type'] = ['null', 'string']
- items.append((new_key, list(v.values())[0][0]))
- elif list(v.values())[0][0]['type'] == 'array':
- list(v.values())[0][0]['type'] = ['null', 'array']
- items.append((new_key, list(v.values())[0][0]))
- elif list(v.values())[0][0]['type'] == 'object':
- list(v.values())[0][0]['type'] = ['null', 'object']
- items.append((new_key, list(v.values())[0][0]))
+ elif len(v.values()) > 0:
+ value_type = list(v.values())[0][0]['type']
+ if value_type in ['string', 'array', 'object']:
+ list(v.values())[0][0]['type'] = ['null', value_type]
+ items.append((new_key, list(v.values())[0][0]))
key_func = lambda item: item[0]
sorted_items = sorted(items, key=key_func)
for k, g in itertools.groupby(sorted_items, key=key_func):
if len(list(g)) > 1:
- raise ValueError('Duplicate column name produced in schema: {}'.format(k))
+ raise ValueError(f'Duplicate column name produced in schema: {k}')
return dict(sorted_items)
diff --git a/target_snowflake/stream_utils.py b/target_snowflake/stream_utils.py
index d7e043fe..5698fd26 100644
--- a/target_snowflake/stream_utils.py
+++ b/target_snowflake/stream_utils.py
@@ -8,6 +8,7 @@
from singer import get_logger
from target_snowflake.exceptions import UnexpectedValueTypeException
+from target_snowflake.exceptions import UnexpectedMessageTypeException
LOGGER = get_logger('target_snowflake')
@@ -115,3 +116,16 @@ def stream_name_to_dict(stream_name, separator='-'):
'schema_name': schema_name,
'table_name': table_name
}
+
+
+def get_incremental_key(singer_msg: Dict):
+ """Derive incremental key from a Singer message dictionary"""
+ if singer_msg['type'] != "SCHEMA":
+ raise UnexpectedMessageTypeException(f"Expecting type SCHEMA, got {singer_msg['type']}")
+
+ if 'bookmark_properties' in singer_msg and len(singer_msg['bookmark_properties']) > 0:
+ col = singer_msg['bookmark_properties'][0]
+ if col in singer_msg['schema']['properties']:
+ return col
+
+ return None
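A small usage sketch of the new helper, with a made-up SCHEMA message (the stream and column names are placeholders): the first `bookmark_properties` entry is returned only when it is also a schema property, which is what `persist_lines()` relies on to decide whether min/max values should be tracked for archived load files.

```python
from target_snowflake.stream_utils import get_incremental_key

schema_msg = {
    "type": "SCHEMA",
    "stream": "tap_mysql_test-test_simple_table",
    "schema": {
        "properties": {
            "id": {"type": ["integer"]},
            "updated_at": {"type": ["null", "string"], "format": "date-time"},
        }
    },
    "key_properties": ["id"],
    "bookmark_properties": ["updated_at"],
}

print(get_incremental_key(schema_msg))  # -> "updated_at"

# Full-table replication: no bookmark column, so no incremental key is derived
# and only tap/schema/table metadata is attached to archived files.
del schema_msg["bookmark_properties"]
print(get_incremental_key(schema_msg))  # -> None
```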
diff --git a/target_snowflake/upload_clients/base_upload_client.py b/target_snowflake/upload_clients/base_upload_client.py
index 11717450..74dc0d4b 100644
--- a/target_snowflake/upload_clients/base_upload_client.py
+++ b/target_snowflake/upload_clients/base_upload_client.py
@@ -24,3 +24,9 @@ def delete_object(self, stream: str, key: str) -> None:
"""
Delete object
"""
+
+ @abstractmethod
+ def copy_object(self, copy_source: str, target_bucket: str, target_key: str, target_metadata: dict) -> None:
+ """
+ Copy object
+ """
diff --git a/target_snowflake/upload_clients/s3_upload_client.py b/target_snowflake/upload_clients/s3_upload_client.py
index 8802107c..e95ac566 100644
--- a/target_snowflake/upload_clients/s3_upload_client.py
+++ b/target_snowflake/upload_clients/s3_upload_client.py
@@ -6,7 +6,7 @@
import datetime
from snowflake.connector.encryption_util import SnowflakeEncryptionUtil
-from snowflake.connector.remote_storage_util import SnowflakeFileEncryptionMaterial
+from snowflake.connector.storage_client import SnowflakeFileEncryptionMaterial
from .base_upload_client import BaseUploadClient
@@ -50,11 +50,9 @@ def upload_file(self, file, stream, temp_dir=None):
bucket = self.connection_config['s3_bucket']
s3_acl = self.connection_config.get('s3_acl')
s3_key_prefix = self.connection_config.get('s3_key_prefix', '')
- s3_key = "{}pipelinewise_{}_{}_{}".format(s3_key_prefix,
- stream,
- datetime.datetime.now().strftime("%Y%m%d-%H%M%S-%f"),
- os.path.basename(file))
+ timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S-%f")
+ s3_key = f"{s3_key_prefix}pipelinewise_{stream}_{timestamp}_{os.path.basename(file)}"
self.logger.info('Target S3 bucket: %s, local file: %s, S3 key: %s', bucket, file, s3_key)
# Encrypt csv if client side encryption enabled
@@ -73,7 +71,7 @@ def upload_file(self, file, stream, temp_dir=None):
)
# Upload to s3
- extra_args = {'ACL': s3_acl} if s3_acl else dict()
+ extra_args = {'ACL': s3_acl} if s3_acl else {}
# Send key and iv in the metadata, that will be required to decrypt and upload the encrypted file
extra_args['Metadata'] = {
@@ -97,3 +95,13 @@ def delete_object(self, stream: str, key: str) -> None:
self.logger.info('Deleting %s from external snowflake stage on S3', key)
bucket = self.connection_config['s3_bucket']
self.s3_client.delete_object(Bucket=bucket, Key=key)
+
+ def copy_object(self, copy_source: str, target_bucket: str, target_key: str, target_metadata: dict) -> None:
+ """Copy object to another location on S3"""
+ self.logger.info('Copying %s to %s/%s', copy_source, target_bucket, target_key)
+ source_bucket, source_key = copy_source.split("/", 1)
+ metadata = self.s3_client.head_object(Bucket=source_bucket, Key=source_key).get('Metadata', {})
+ metadata.update(target_metadata)
+ # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.copy_object
+ self.s3_client.copy_object(CopySource=copy_source, Bucket=target_bucket, Key=target_key,
+ Metadata=metadata, MetadataDirective="REPLACE")
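
For orientation, a hedged sketch of calling the new copy_object to archive an already-uploaded load file. Everything below is illustrative: the bucket, key and metadata values are placeholders mirroring what test_archive_load_files asserts later in this diff, the constructor is assumed to take the same connection_config dict used elsewhere, and this is not the target's actual archiving code:

    import os

    from target_snowflake.upload_clients.s3_upload_client import S3UploadClient

    # Placeholder config with only the keys this sketch needs (assumed, not a full example config)
    connection_config = {'s3_bucket': 'my-bucket',
                         'aws_access_key_id': 'xxx',
                         'aws_secret_access_key': 'yyy'}
    upload_client = S3UploadClient(connection_config)

    s3_key = 'prefix/pipelinewise_mystream_20230101-000000-000000_batch.csv'
    upload_client.copy_object(
        copy_source=f"my-bucket/{s3_key}",  # "<source-bucket>/<source-key>", as split above
        target_bucket='my-bucket',
        target_key=f"archive_folder/test_tap_id/{os.path.basename(s3_key)}",
        target_metadata={'tap': 'test_tap_id',
                         'schema': 'tap_mysql_test',
                         'table': 'test_simple_table',
                         'archived-by': 'pipelinewise_target_snowflake'},
    )
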
diff --git a/target_snowflake/upload_clients/snowflake_upload_client.py b/target_snowflake/upload_clients/snowflake_upload_client.py
index 5366ae82..67d4a9b8 100644
--- a/target_snowflake/upload_clients/snowflake_upload_client.py
+++ b/target_snowflake/upload_clients/snowflake_upload_client.py
@@ -38,3 +38,7 @@ def delete_object(self, stream: str, key: str) -> None:
with self.dblink.open_connection() as connection:
connection.cursor().execute(f"REMOVE '@{stage}/{key}'")
+
+ def copy_object(self, copy_source: str, target_bucket: str, target_key: str, target_metadata: dict) -> None:
+ raise NotImplementedError(
+ "Copying objects is not supported with a Snowflake upload client.")
diff --git a/tests/integration/.env.sample b/tests/integration/.env.sample
index bb97ed1f..98c55b45 100644
--- a/tests/integration/.env.sample
+++ b/tests/integration/.env.sample
@@ -13,4 +13,3 @@ TARGET_SNOWFLAKE_STAGE=
TARGET_SNOWFLAKE_FILE_FORMAT_CSV=
TARGET_SNOWFLAKE_FILE_FORMAT_PARQUET=
CLIENT_SIDE_ENCRYPTION_MASTER_KEY=
-CLIENT_SIDE_ENCRYPTION_STAGE_OBJECT=
diff --git a/tests/integration/resources/messages-with-changing-pk.json b/tests/integration/resources/messages-with-changing-pk.json
new file mode 100644
index 00000000..2a3c4c0a
--- /dev/null
+++ b/tests/integration/resources/messages-with-changing-pk.json
@@ -0,0 +1,13 @@
+{"type": "SCHEMA", "stream": "tap_mysql_test-test_simple_table", "schema": {"properties": {"id": {"type": ["integer"]}, "results": {"type": ["null", "string"]}, "time_created": {"type": ["null", "string"]}}, "type": "date-time"}, "key_properties": ["id"]}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 1, "version": 1}}}}
+{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_simple_table", "version": 1}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 1, "results": "xyz1", "time_created": "2019-12-01T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 2, "results": "xyz2", "time_created": "2019-12-03T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 2, "version": 1}}}}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 3, "results": "xyz3", "time_created": "2019-12-09T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 4, "results": "xyz4", "time_created": "2019-12-11T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 4, "version": 1}}}}
+{"type": "SCHEMA", "stream": "tap_mysql_test-test_simple_table", "schema": {"properties": {"id": {"type": ["integer"]}, "name": {"type": "string"}, "results": {"type": ["null", "string"]}, "time_created": {"type": ["null", "string"]}}, "type": "date-time"}, "key_properties": ["id", "name"]}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 5, "name": "A", "results": "xyz5", "time_created": "2019-12-17T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 6, "name": "B", "results": "xyz6", "time_created": "2019-12-17T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 6, "version": 1}}}}
\ No newline at end of file
diff --git a/tests/integration/resources/messages-with-falsy-pk-values.json b/tests/integration/resources/messages-with-falsy-pk-values.json
new file mode 100644
index 00000000..edd665b8
--- /dev/null
+++ b/tests/integration/resources/messages-with-falsy-pk-values.json
@@ -0,0 +1,13 @@
+{"type": "SCHEMA", "stream": "tap_mysql_test-test_simple_table", "schema": {"properties": {"id": {"type": ["integer"]}, "results": {"type": ["null", "string"]}, "time_created": {"type": ["null", "string"]}}, "type": "date-time"}, "key_properties": ["id"]}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 1, "version": 1}}}}
+{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_simple_table", "version": 1}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 0, "results": "xyz1", "time_created": "2019-12-01T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 1, "results": "xyz1", "time_created": "2019-12-01T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 2, "results": "xyz2", "time_created": "2019-12-03T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "SCHEMA", "stream": "tap_mysql_test-test_simple_table", "schema": {"properties": {"id": {"type": ["integer"]}, "c_bool": {"type": ["boolean"]}, "results": {"type": ["null", "string"]}, "time_created": {"type": ["null", "string"]}}, "type": "date-time"}, "key_properties": ["id", "c_bool"]}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 2, "version": 1}}}}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 3, "c_bool": false, "results": "xyz3", "time_created": "2019-12-17T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 4, "c_bool": true, "results": "xyz4", "time_created": "2019-12-17T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 0, "c_bool": true, "results": "xyz4", "time_created": "2019-12-17T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 5, "c_bool": false, "results": "xyz4", "time_created": "2019-12-17T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 0, "c_bool": false, "results": "xyz4", "time_created": "2019-12-17T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
\ No newline at end of file
diff --git a/tests/integration/resources/messages-with-multi-schemas.json b/tests/integration/resources/messages-with-multi-schemas.json
index 993e2587..9f3cc6fd 100644
--- a/tests/integration/resources/messages-with-multi-schemas.json
+++ b/tests/integration/resources/messages-with-multi-schemas.json
@@ -6,10 +6,10 @@
{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_table_one", "version": 1}
{"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_one", "bookmarks": {"tap_mysql_test-test_table_one": {"initial_full_table_complete": true}}}}
{"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_two", "bookmarks": {"tap_mysql_test-test_table_two": {"initial_full_table_complete": true}}}}
-{"type": "SCHEMA", "stream": "tap_mysql_test-test_table_two", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_varchar": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "c_int": {"inclusion": "available", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_date": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}}, "type": "object"}, "key_properties": ["c_pk"]}
+{"type": "SCHEMA", "stream": "tap_mysql_test-test_table_two", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_varchar": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "c_int": {"inclusion": "available", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_date": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}, "c_iso_date": {"format": "date", "inclusion": "available", "type": ["null", "string"]}}, "type": "object"}, "key_properties": ["c_pk"]}
{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_table_two", "version": 3}
-{"type": "RECORD", "stream": "tap_mysql_test-test_table_two", "record": {"c_pk": 1, "c_varchar": "1", "c_int": 1, "c_date": "2019-02-01 15:12:45", "_sdc_deleted_at": "2019-02-12T01:10:10.000000Z"}, "version": 3, "time_extracted": "2019-01-31T15:51:48.861962Z"}
-{"type": "RECORD", "stream": "tap_mysql_test-test_table_two", "record": {"c_pk": 2, "c_varchar": "2", "c_int": 2, "c_date": "2019-02-10 02:00:00"}, "version": 3, "time_extracted": "2019-01-31T15:51:48.861962Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_table_two", "record": {"c_pk": 1, "c_varchar": "1", "c_int": 1, "c_date": "2019-02-01 15:12:45", "c_iso_date": "2019-02-01", "_sdc_deleted_at": "2019-02-12T01:10:10.000000Z"}, "version": 3, "time_extracted": "2019-01-31T15:51:48.861962Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_table_two", "record": {"c_pk": 2, "c_varchar": "2", "c_int": 2, "c_date": "2019-02-10 02:00:00", "c_iso_date": "2019-02-10"}, "version": 3, "time_extracted": "2019-01-31T15:51:48.861962Z"}
{"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_two", "bookmarks": {"tap_mysql_test-test_table_wo": {"initial_full_table_complete": true}}}}
{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_table_three", "version": 3}
{"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_two", "bookmarks": {"tap_mysql_test-test_table_one": {"initial_full_table_complete": true}, "tap_mysql_test-test_table_three": {"initial_full_table_complete": true}}}}
diff --git a/tests/integration/resources/messages-with-new-pk.json b/tests/integration/resources/messages-with-new-pk.json
new file mode 100644
index 00000000..9fb431d3
--- /dev/null
+++ b/tests/integration/resources/messages-with-new-pk.json
@@ -0,0 +1,13 @@
+{"type": "SCHEMA", "stream": "tap_mysql_test-test_simple_table", "schema": {"properties": {"id": {"type": ["integer"]}, "results": {"type": ["null", "string"]}, "time_created": {"type": ["null", "string"]}}, "type": "date-time"}, "key_properties": []}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 1, "version": 1}}}}
+{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_simple_table", "version": 1}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 1, "results": "xyz1", "time_created": "2019-12-01T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 2, "results": "xyz2", "time_created": "2019-12-03T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 2, "version": 1}}}}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 3, "results": "xyz3", "time_created": "2019-12-09T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 4, "results": "xyz4", "time_created": "2019-12-11T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 4, "version": 1}}}}
+{"type": "SCHEMA", "stream": "tap_mysql_test-test_simple_table", "schema": {"properties": {"id": {"type": ["integer"]}, "name": {"type": "string"}, "results": {"type": ["null", "string"]}, "time_created": {"type": ["null", "string"]}}, "type": "date-time"}, "key_properties": ["id", "name"]}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 5, "name": "A", "results": "xyz5", "time_created": "2019-12-17T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 6, "name": "B", "results": "xyz6", "time_created": "2019-12-17T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 6, "version": 1}}}}
\ No newline at end of file
diff --git a/tests/integration/resources/messages-with-null-pk.json b/tests/integration/resources/messages-with-null-pk.json
new file mode 100644
index 00000000..0a753966
--- /dev/null
+++ b/tests/integration/resources/messages-with-null-pk.json
@@ -0,0 +1,8 @@
+{"type": "SCHEMA", "stream": "tap_mysql_test-test_simple_table", "schema": {"properties": {"id": {"type": ["integer"]}, "results": {"type": ["null", "string"]}, "time_created": {"type": ["null", "string"]}}, "type": "date-time"}, "key_properties": ["id"]}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 1, "version": 1}}}}
+{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_simple_table", "version": 1}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 1, "results": "xyz1", "time_created": "2019-12-01T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 2, "results": "xyz2", "time_created": "2019-12-03T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 2, "version": 1}}}}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": null, "results": "xyz3", "time_created": "2019-12-17T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": null, "results": "xyz4", "time_created": "2019-12-17T19:12:12.006049Z"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
\ No newline at end of file
diff --git a/tests/integration/resources/messages-with-three-streams-modified-column.json b/tests/integration/resources/messages-with-three-streams-modified-column.json
index f0acee87..4867b631 100644
--- a/tests/integration/resources/messages-with-three-streams-modified-column.json
+++ b/tests/integration/resources/messages-with-three-streams-modified-column.json
@@ -6,10 +6,10 @@
{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_table_one", "version": 1}
{"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_one", "bookmarks": {"tap_mysql_test-test_table_one": {"initial_full_table_complete": true}}}}
{"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_two", "bookmarks": {"tap_mysql_test-test_table_two": {"initial_full_table_complete": true}}}}
-{"type": "SCHEMA", "stream": "tap_mysql_test-test_table_two", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_varchar": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "c_int": {"inclusion": "available", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_date": {"inclusion": "available", "type": ["null", "string"]}, "c_new_column": {"inclusion": "available", "type": ["null", "string"]}}, "type": "object"}, "key_properties": ["c_pk"]}
+{"type": "SCHEMA", "stream": "tap_mysql_test-test_table_two", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_varchar": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "c_int": {"inclusion": "available", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_date": {"inclusion": "available", "type": ["null", "string"]}, "c_iso_date": {"format": "date", "inclusion": "available", "type": ["null", "string"]}, "c_new_column": {"inclusion": "available", "type": ["null", "string"]}}, "type": "object"}, "key_properties": ["c_pk"]}
{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_table_two", "version": 3}
-{"type": "RECORD", "stream": "tap_mysql_test-test_table_two", "record": {"c_pk": 2, "c_varchar": "2", "c_int": 2, "c_date": "2019-02-12 02:00:00", "c_new_column": "data 1"}, "version": 3, "time_extracted": "2019-01-31T15:51:48.861962Z"}
-{"type": "RECORD", "stream": "tap_mysql_test-test_table_two", "record": {"c_pk": 3, "c_varchar": "2", "c_int": 3, "c_date": "2019-02-15 02:00:00", "c_new_column": "data 2"}, "version": 3, "time_extracted": "2019-01-31T15:51:48.861962Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_table_two", "record": {"c_pk": 2, "c_varchar": "2", "c_int": 2, "c_date": "2019-02-12 02:00:00", "c_iso_date": "2019-02-10", "c_new_column": "data 1"}, "version": 3, "time_extracted": "2019-01-31T15:51:48.861962Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_table_two", "record": {"c_pk": 3, "c_varchar": "2", "c_int": 3, "c_date": "2019-02-15 02:00:00", "c_iso_date": "2019-02-15", "c_new_column": "data 2"}, "version": 3, "time_extracted": "2019-01-31T15:51:48.861962Z"}
{"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_two", "bookmarks": {"tap_mysql_test-test_table_wo": {"initial_full_table_complete": true}}}}
{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_table_three", "version": 3}
{"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_two", "bookmarks": {"tap_mysql_test-test_table_one": {"initial_full_table_complete": true}, "tap_mysql_test-test_table_three": {"initial_full_table_complete": true}}}}
diff --git a/tests/integration/resources/messages-with-three-streams.json b/tests/integration/resources/messages-with-three-streams.json
index 975cc0f3..75b49eba 100644
--- a/tests/integration/resources/messages-with-three-streams.json
+++ b/tests/integration/resources/messages-with-three-streams.json
@@ -6,10 +6,10 @@
{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_table_one", "version": 1}
{"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_one", "bookmarks": {"tap_mysql_test-test_table_one": {"initial_full_table_complete": true}}}}
{"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_two", "bookmarks": {"tap_mysql_test-test_table_two": {"initial_full_table_complete": true}}}}
-{"type": "SCHEMA", "stream": "tap_mysql_test-test_table_two", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_varchar": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "c_int": {"inclusion": "available", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_date": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}}, "type": "object"}, "key_properties": ["c_pk"]}
+{"type": "SCHEMA", "stream": "tap_mysql_test-test_table_two", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_varchar": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "c_int": {"inclusion": "available", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_date": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}, "c_iso_date": {"format": "date", "inclusion": "available", "type": ["null", "string"]}}, "type": "object"}, "key_properties": ["c_pk"]}
{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_table_two", "version": 3}
-{"type": "RECORD", "stream": "tap_mysql_test-test_table_two", "record": {"c_pk": 1, "c_varchar": "1", "c_int": 1, "c_date": "2019-02-01 15:12:45", "_sdc_deleted_at": "2019-02-12T01:10:10.000000Z"}, "version": 3, "time_extracted": "2019-01-31T15:51:48.861962Z"}
-{"type": "RECORD", "stream": "tap_mysql_test-test_table_two", "record": {"c_pk": 2, "c_varchar": "2", "c_int": 2, "c_date": "2019-02-10 02:00:00"}, "version": 3, "time_extracted": "2019-01-31T15:51:48.861962Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_table_two", "record": {"c_pk": 1, "c_varchar": "1", "c_int": 1, "c_date": "2019-02-01 15:12:45", "c_iso_date": "2019-02-01", "_sdc_deleted_at": "2019-02-12T01:10:10.000000Z"}, "version": 3, "time_extracted": "2019-01-31T15:51:48.861962Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_table_two", "record": {"c_pk": 2, "c_varchar": "2", "c_int": 2, "c_date": "2019-02-10 02:00:00", "c_iso_date": "2019-02-10"}, "version": 3, "time_extracted": "2019-01-31T15:51:48.861962Z"}
{"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_two", "bookmarks": {"tap_mysql_test-test_table_wo": {"initial_full_table_complete": true}}}}
{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_table_three", "version": 3}
{"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_two", "bookmarks": {"tap_mysql_test-test_table_one": {"initial_full_table_complete": true}, "tap_mysql_test-test_table_three": {"initial_full_table_complete": true}}}}
diff --git a/tests/integration/test_target_snowflake.py b/tests/integration/test_target_snowflake.py
index e229a243..791acda2 100644
--- a/tests/integration/test_target_snowflake.py
+++ b/tests/integration/test_target_snowflake.py
@@ -1,16 +1,19 @@
import datetime
+import gzip
import json
+import tempfile
import unittest
-import mock
import os
import botocore
-import itertools
-
+import boto3
import target_snowflake
+
from target_snowflake import RecordValidationException
+from target_snowflake.exceptions import PrimaryKeyNotFoundException
from target_snowflake.db_sync import DbSync
from target_snowflake.upload_clients.s3_upload_client import S3UploadClient
+from unittest import mock
from pyarrow.lib import ArrowTypeError
from snowflake.connector.errors import ProgrammingError
from snowflake.connector.errors import DatabaseError
@@ -35,11 +38,29 @@ class TestIntegration(unittest.TestCase):
def setUp(self):
self.config = test_utils.get_test_config()
- snowflake = DbSync(self.config)
+ self.snowflake = DbSync(self.config)
# Drop target schema
if self.config['default_target_schema']:
- snowflake.query("DROP SCHEMA IF EXISTS {}".format(self.config['default_target_schema']))
+ self.snowflake.query("DROP SCHEMA IF EXISTS {}".format(self.config['default_target_schema']))
+
+ if self.config['schema_mapping']:
+ for _, val in self.config['schema_mapping'].items():
+ self.snowflake.query('drop schema if exists {}'.format(val['target_schema']))
+
+ # Set up S3 client
+ aws_access_key_id = self.config.get('aws_access_key_id')
+ aws_secret_access_key = self.config.get('aws_secret_access_key')
+ aws_session_token = self.config.get('aws_session_token')
+ aws_session = boto3.session.Session(
+ aws_access_key_id=aws_access_key_id,
+ aws_secret_access_key=aws_secret_access_key,
+ aws_session_token=aws_session_token
+ )
+
+ self.s3_client = aws_session.client('s3',
+ region_name=self.config.get('s3_region_name'),
+ endpoint_url=self.config.get('s3_endpoint_url'))
def persist_lines(self, lines):
"""Loads singer messages into snowflake without table caching option"""
@@ -121,17 +142,17 @@ def assert_three_streams_are_into_snowflake(self, should_metadata_columns_exist=
self.remove_metadata_columns_from_rows(table_one), expected_table_one)
# ----------------------------------------------------------------------
- # Check rows in table_tow
+ # Check rows in table_two
# ----------------------------------------------------------------------
expected_table_two = []
if not should_hard_deleted_rows:
expected_table_two = [
- {'C_INT': 1, 'C_PK': 1, 'C_VARCHAR': '1', 'C_DATE': datetime.datetime(2019, 2, 1, 15, 12, 45)},
- {'C_INT': 2, 'C_PK': 2, 'C_VARCHAR': '2', 'C_DATE': datetime.datetime(2019, 2, 10, 2, 0, 0)}
+ {'C_INT': 1, 'C_PK': 1, 'C_VARCHAR': '1', 'C_DATE': datetime.datetime(2019, 2, 1, 15, 12, 45), 'C_ISO_DATE':datetime.date(2019, 2, 1)},
+ {'C_INT': 2, 'C_PK': 2, 'C_VARCHAR': '2', 'C_DATE': datetime.datetime(2019, 2, 10, 2, 0, 0), 'C_ISO_DATE':datetime.date(2019, 2, 10)}
]
else:
expected_table_two = [
- {'C_INT': 2, 'C_PK': 2, 'C_VARCHAR': '2', 'C_DATE': datetime.datetime(2019, 2, 10, 2, 0, 0)}
+ {'C_INT': 2, 'C_PK': 2, 'C_VARCHAR': '2', 'C_DATE': datetime.datetime(2019, 2, 10, 2, 0, 0), 'C_ISO_DATE':datetime.date(2019, 2, 10)}
]
self.assertEqual(
@@ -188,7 +209,7 @@ def assert_logical_streams_are_in_snowflake(self, should_metadata_columns_exist=
]
# ----------------------------------------------------------------------
- # Check rows in table_tow
+ # Check rows in table_two
# ----------------------------------------------------------------------
expected_table_two = [
{'CID': 1, 'CVARCHAR': "updated row"},
@@ -583,11 +604,11 @@ def test_column_name_change(self):
table_two,
[
{previous_column_name: datetime.datetime(2019, 2, 1, 15, 12, 45), 'C_INT': 1, 'C_PK': 1,
- 'C_VARCHAR': '1', 'C_DATE': None, 'C_NEW_COLUMN': None},
+ 'C_VARCHAR': '1', 'C_DATE': None, 'C_ISO_DATE': datetime.date(2019, 2, 1), 'C_NEW_COLUMN': None},
{previous_column_name: datetime.datetime(2019, 2, 10, 2), 'C_INT': 2, 'C_PK': 2, 'C_VARCHAR': '2',
- 'C_DATE': '2019-02-12 02:00:00', 'C_NEW_COLUMN': 'data 1'},
+ 'C_DATE': '2019-02-12 02:00:00', 'C_ISO_DATE': datetime.date(2019, 2, 10), 'C_NEW_COLUMN': 'data 1'},
{previous_column_name: None, 'C_INT': 3, 'C_PK': 3, 'C_VARCHAR': '2', 'C_DATE': '2019-02-15 02:00:00',
- 'C_NEW_COLUMN': 'data 2'}
+ 'C_ISO_DATE': datetime.date(2019, 2, 15), 'C_NEW_COLUMN': 'data 2'}
]
)
@@ -644,12 +665,12 @@ def test_column_name_change_without_table_cache(self):
table_two,
[
{previous_column_name: datetime.datetime(2019, 2, 1, 15, 12, 45), 'C_INT': 1, 'C_PK': 1,
- 'C_VARCHAR': '1', 'C_DATE': None, 'C_NEW_COLUMN': None},
+ 'C_VARCHAR': '1', 'C_DATE': None, 'C_ISO_DATE': datetime.date(2019, 2, 1), 'C_NEW_COLUMN': None},
{previous_column_name: datetime.datetime(2019, 2, 10, 2), 'C_INT': 2, 'C_PK': 2, 'C_VARCHAR': '2',
- 'C_DATE': '2019-02-12 02:00:00', 'C_NEW_COLUMN': 'data 1'},
+ 'C_DATE': '2019-02-12 02:00:00', 'C_ISO_DATE': datetime.date(2019, 2, 10), 'C_NEW_COLUMN': 'data 1'},
{previous_column_name: None, 'C_INT': 3, 'C_PK': 3, 'C_VARCHAR': '2', 'C_DATE': '2019-02-15 02:00:00',
- 'C_NEW_COLUMN': 'data 2'}
- ]
+ 'C_ISO_DATE': datetime.date(2019, 2, 15), 'C_NEW_COLUMN': 'data 2'}
+ ]
)
# Table three should have a renamed columns and a new column
@@ -743,58 +764,88 @@ def test_flush_streams_with_intermediate_flushes(self, mock_emit_state):
[
# Flush #1 - Flushed edgydata until lsn: 108197216
mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
+ "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216,
+ "version": 1570922723596, "xmin": None},
+ "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176,
+ "version": 1570922723618, "xmin": None},
+ "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108196176,
+ "version": 1570922723635, "xmin": None},
+ "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176,
+ "version": 1570922723651, "xmin": None},
+ "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id",
+ "version": 1570922723667, "replication_key_value": 4079},
+ "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
+ "public2-wearehere": {}}}),
# Flush #2 - Flushed logical1-logical1_table2 until lsn: 108201336
mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108201336, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
+ "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216,
+ "version": 1570922723596, "xmin": None},
+ "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176,
+ "version": 1570922723618, "xmin": None},
+ "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108201336,
+ "version": 1570922723635, "xmin": None},
+ "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176,
+ "version": 1570922723651, "xmin": None},
+ "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id",
+ "version": 1570922723667, "replication_key_value": 4079},
+ "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
+ "public2-wearehere": {}}}),
# Flush #3 - Flushed logical1-logical1_table2 until lsn: 108237600
mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108237600, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
+ "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216,
+ "version": 1570922723596, "xmin": None},
+ "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176,
+ "version": 1570922723618, "xmin": None},
+ "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108237600,
+ "version": 1570922723635, "xmin": None},
+ "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176,
+ "version": 1570922723651, "xmin": None},
+ "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id",
+ "version": 1570922723667, "replication_key_value": 4079},
+ "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
+ "public2-wearehere": {}}}),
# Flush #4 - Flushed logical1-logical1_table2 until lsn: 108238768
mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108238768, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
+ "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216,
+ "version": 1570922723596, "xmin": None},
+ "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176,
+ "version": 1570922723618, "xmin": None},
+ "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108238768,
+ "version": 1570922723635, "xmin": None},
+ "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176,
+ "version": 1570922723651, "xmin": None},
+ "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id",
+ "version": 1570922723667, "replication_key_value": 4079},
+ "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
+ "public2-wearehere": {}}}),
# Flush #5 - Flushed logical1-logical1_table2 until lsn: 108239704,
mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108239896, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
+ "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216,
+ "version": 1570922723596, "xmin": None},
+ "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176,
+ "version": 1570922723618, "xmin": None},
+ "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108239896,
+ "version": 1570922723635, "xmin": None},
+ "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176,
+ "version": 1570922723651, "xmin": None},
+ "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id",
+ "version": 1570922723667, "replication_key_value": 4079},
+ "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
+ "public2-wearehere": {}}}),
# Flush #6 - Last flush, update every stream lsn: 108240872,
mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
+ "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108240872,
+ "version": 1570922723596, "xmin": None},
+ "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872,
+ "version": 1570922723618, "xmin": None},
+ "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108240872,
+ "version": 1570922723635, "xmin": None},
+ "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872,
+ "version": 1570922723651, "xmin": None},
+ "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id",
+ "version": 1570922723667, "replication_key_value": 4079},
+ "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
+ "public2-wearehere": {}}}),
])
# Every table should be loaded correctly
@@ -818,56 +869,86 @@ def test_flush_streams_with_intermediate_flushes_on_all_streams(self, mock_emit_
[
# Flush #1 - Flush every stream until lsn: 108197216
mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
+ "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216,
+ "version": 1570922723596, "xmin": None},
+ "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108197216,
+ "version": 1570922723618, "xmin": None},
+ "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108197216,
+ "version": 1570922723635, "xmin": None},
+ "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108197216,
+ "version": 1570922723651, "xmin": None},
+ "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id",
+ "version": 1570922723667, "replication_key_value": 4079},
"public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
"public2-wearehere": {}}}),
# Flush #2 - Flush every stream until lsn 108201336
mock.call({'currently_syncing': None, 'bookmarks': {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108201336, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108201336, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108201336, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108201336, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
+ "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108201336,
+ "version": 1570922723596, "xmin": None},
+ "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108201336,
+ "version": 1570922723618, "xmin": None},
+ "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108201336,
+ "version": 1570922723635, "xmin": None},
+ "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108201336,
+ "version": 1570922723651, "xmin": None},
+ "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id",
+ "version": 1570922723667, "replication_key_value": 4079},
"public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
"public2-wearehere": {}}}),
# Flush #3 - Flush every stream until lsn: 108237600
mock.call({'currently_syncing': None, 'bookmarks': {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108237600, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108237600, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108237600, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108237600, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
+ "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108237600,
+ "version": 1570922723596, "xmin": None},
+ "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108237600,
+ "version": 1570922723618, "xmin": None},
+ "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108237600,
+ "version": 1570922723635, "xmin": None},
+ "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108237600,
+ "version": 1570922723651, "xmin": None},
+ "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id",
+ "version": 1570922723667, "replication_key_value": 4079},
"public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
"public2-wearehere": {}}}),
# Flush #4 - Flush every stream until lsn: 108238768
mock.call({'currently_syncing': None, 'bookmarks': {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108238768, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108238768, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108238768, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108238768, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
+ "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108238768,
+ "version": 1570922723596, "xmin": None},
+ "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108238768,
+ "version": 1570922723618, "xmin": None},
+ "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108238768,
+ "version": 1570922723635, "xmin": None},
+ "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108238768,
+ "version": 1570922723651, "xmin": None},
+ "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id",
+ "version": 1570922723667, "replication_key_value": 4079},
"public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
"public2-wearehere": {}}}),
# Flush #5 - Flush every stream until lsn: 108239704,
mock.call({'currently_syncing': None, 'bookmarks': {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108239896, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108239896, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108239896, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108239896, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
+ "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108239896,
+ "version": 1570922723596, "xmin": None},
+ "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108239896,
+ "version": 1570922723618, "xmin": None},
+ "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108239896,
+ "version": 1570922723635, "xmin": None},
+ "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108239896,
+ "version": 1570922723651, "xmin": None},
+ "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id",
+ "version": 1570922723667, "replication_key_value": 4079},
"public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
"public2-wearehere": {}}}),
# Flush #6 - Last flush, update every stream until lsn: 108240872,
mock.call({'currently_syncing': None, 'bookmarks': {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
+ "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108240872,
+ "version": 1570922723596, "xmin": None},
+ "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872,
+ "version": 1570922723618, "xmin": None},
+ "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108240872,
+ "version": 1570922723635, "xmin": None},
+ "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872,
+ "version": 1570922723651, "xmin": None},
+ "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id",
+ "version": 1570922723667, "replication_key_value": 4079},
"public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
"public2-wearehere": {}}}),
])
@@ -1086,18 +1167,18 @@ def test_query_tagging(self):
self.assertEqual(result, [{
'QUERY_TAG': f'PPW test tap run at {current_time}. Loading into {target_db}..',
'QUERIES': 4
- },
+ },
{
- 'QUERY_TAG': f'PPW test tap run at {current_time}. Loading into {target_db}.{target_schema}.TEST_TABLE_ONE',
- 'QUERIES': 7
+ 'QUERY_TAG': f'PPW test tap run at {current_time}. Loading into {target_db}.{target_schema}.TEST_TABLE_ONE',
+ 'QUERIES': 10
},
{
- 'QUERY_TAG': f'PPW test tap run at {current_time}. Loading into {target_db}.{target_schema}.TEST_TABLE_THREE',
- 'QUERIES': 6
+ 'QUERY_TAG': f'PPW test tap run at {current_time}. Loading into {target_db}.{target_schema}.TEST_TABLE_THREE',
+ 'QUERIES': 9
},
{
- 'QUERY_TAG': f'PPW test tap run at {current_time}. Loading into {target_db}.{target_schema}.TEST_TABLE_TWO',
- 'QUERIES': 6
+ 'QUERY_TAG': f'PPW test tap run at {current_time}. Loading into {target_db}.{target_schema}.TEST_TABLE_TWO',
+ 'QUERIES': 9
}
])
@@ -1108,7 +1189,7 @@ def test_query_tagging(self):
AND query_text like 'SHOW FILE FORMATS%%'""")
self.assertEqual(result, [{
'SHOW_FILE_FORMAT_QUERIES': 1
- }
+ }
])
def test_table_stage(self):
@@ -1159,3 +1240,137 @@ def test_parquet(self):
# Check if data loaded correctly and metadata columns exist
self.assert_three_streams_are_into_snowflake()
+
+ def test_archive_load_files(self):
+ """Test if load file is copied to archive folder"""
+ self.config['archive_load_files'] = True
+ self.config['archive_load_files_s3_prefix'] = 'archive_folder'
+ self.config['tap_id'] = 'test_tap_id'
+ self.config['client_side_encryption_master_key'] = ''
+
+ s3_bucket = self.config['s3_bucket']
+
+ # Delete any dangling files from archive
+ files_in_s3_archive = self.s3_client.list_objects(
+ Bucket=s3_bucket, Prefix="archive_folder/test_tap_id/").get('Contents', [])
+ for file_in_archive in files_in_s3_archive:
+ key = file_in_archive["Key"]
+ self.s3_client.delete_object(Bucket=s3_bucket, Key=key)
+
+ tap_lines = test_utils.get_test_tap_lines('messages-simple-table.json')
+ self.persist_lines_with_cache(tap_lines)
+
+ # Verify expected file metadata in S3
+ files_in_s3_archive = self.s3_client.list_objects(Bucket=s3_bucket, Prefix="archive_folder/test_tap_id/").get(
+ 'Contents')
+ self.assertIsNotNone(files_in_s3_archive)
+ self.assertEqual(1, len(files_in_s3_archive))
+
+ archived_file_key = files_in_s3_archive[0]['Key']
+ archive_metadata = self.s3_client.head_object(Bucket=s3_bucket, Key=archived_file_key)['Metadata']
+ self.assertEqual({
+ 'tap': 'test_tap_id',
+ 'schema': 'tap_mysql_test',
+ 'table': 'test_simple_table',
+ 'archived-by': 'pipelinewise_target_snowflake',
+ 'incremental-key': 'id',
+ 'incremental-key-min': '1',
+ 'incremental-key-max': '5'
+ }, archive_metadata)
+
+ # Verify expected file contents
+ tmpfile = tempfile.NamedTemporaryFile()
+ with open(tmpfile.name, 'wb') as f:
+ self.s3_client.download_fileobj(s3_bucket, archived_file_key, f)
+
+ lines = []
+ with gzip.open(tmpfile, "rt") as gzipfile:
+ for line in gzipfile.readlines():
+ lines.append(line)
+
+ self.assertEqual(''.join(lines), '''1,"xyz1","not-formatted-time-1"
+2,"xyz2","not-formatted-time-2"
+3,"xyz3","not-formatted-time-3"
+4,"xyz4","not-formatted-time-4"
+5,"xyz5","not-formatted-time-5"
+''')
+
+ def test_stream_with_changing_pks_should_succeed(self):
+ """Test if table will have its PKs adjusted according to changes in schema key-properties"""
+ tap_lines = test_utils.get_test_tap_lines('messages-with-changing-pk.json')
+
+ self.persist_lines_with_cache(tap_lines)
+
+ table_desc = self.snowflake.query(f'desc table {self.config["default_target_schema"]}.test_simple_table;')
+ rows_count = self.snowflake.query(f'select count(1) as _count from'
+ f' {self.config["default_target_schema"]}.test_simple_table;')
+
+ self.assertEqual(6, rows_count[0]['_COUNT'])
+
+ self.assertEqual(4, len(table_desc))
+
+ self.assertEqual('ID', table_desc[0]['name'])
+ self.assertEqual('Y', table_desc[0]['null?'])
+ self.assertEqual('Y', table_desc[0]['primary key'])
+
+ self.assertEqual('RESULTS', table_desc[1]['name'])
+ self.assertEqual('Y', table_desc[1]['null?'])
+ self.assertEqual('N', table_desc[1]['primary key'])
+
+ self.assertEqual('TIME_CREATED', table_desc[2]['name'])
+ self.assertEqual('Y', table_desc[2]['null?'])
+ self.assertEqual('N', table_desc[2]['primary key'])
+
+ self.assertEqual('NAME', table_desc[3]['name'])
+ self.assertEqual('Y', table_desc[3]['null?'])
+ self.assertEqual('Y', table_desc[3]['primary key'])
+
+ def test_stream_with_null_values_in_pks_should_fail(self):
+ """Test if null values in PK column should abort the process"""
+ tap_lines = test_utils.get_test_tap_lines('messages-with-null-pk.json')
+
+ with self.assertRaises(PrimaryKeyNotFoundException):
+ self.persist_lines_with_cache(tap_lines)
+
+ def test_stream_with_new_pks_should_succeed(self):
+ """Test if table will have new PKs after not having any"""
+ tap_lines = test_utils.get_test_tap_lines('messages-with-new-pk.json')
+
+ self.config['primary_key_required'] = False
+
+ self.persist_lines_with_cache(tap_lines)
+
+ table_desc = self.snowflake.query(f'desc table {self.config["default_target_schema"]}.test_simple_table;')
+ rows_count = self.snowflake.query(f'select count(1) as _count from'
+ f' {self.config["default_target_schema"]}.test_simple_table;')
+
+ self.assertEqual(6, rows_count[0]['_COUNT'])
+
+ self.assertEqual(4, len(table_desc))
+
+ self.assertEqual('ID', table_desc[0]['name'])
+ self.assertEqual('Y', table_desc[0]['null?'])
+ self.assertEqual('Y', table_desc[0]['primary key'])
+
+ self.assertEqual('RESULTS', table_desc[1]['name'])
+ self.assertEqual('Y', table_desc[1]['null?'])
+ self.assertEqual('N', table_desc[1]['primary key'])
+
+ self.assertEqual('TIME_CREATED', table_desc[2]['name'])
+ self.assertEqual('Y', table_desc[2]['null?'])
+ self.assertEqual('N', table_desc[2]['primary key'])
+
+ self.assertEqual('NAME', table_desc[3]['name'])
+ self.assertEqual('Y', table_desc[3]['null?'])
+ self.assertEqual('Y', table_desc[3]['primary key'])
+
+ def test_stream_with_falsy_pks_should_succeed(self):
+ """Test if data will be loaded if records have falsy values"""
+ tap_lines = test_utils.get_test_tap_lines('messages-with-falsy-pk-values.json')
+
+ self.persist_lines_with_cache(tap_lines)
+
+ rows_count = self.snowflake.query(f'select count(1) as _count from'
+ f' {self.config["default_target_schema"]}.test_simple_table;')
+
+ self.assertEqual(8, rows_count[0]['_COUNT'])
diff --git a/tests/unit/resources/messages-simple-table.json b/tests/unit/resources/messages-simple-table.json
new file mode 100644
index 00000000..5b270ded
--- /dev/null
+++ b/tests/unit/resources/messages-simple-table.json
@@ -0,0 +1,8 @@
+{"type": "SCHEMA", "stream": "tap_mysql_test-test_simple_table", "schema": {"definitions": {"sdc_recursive_boolean_array": {"items": {"$ref": "#/definitions/sdc_recursive_boolean_array"}, "type": ["null", "boolean", "array"]}, "sdc_recursive_integer_array": {"items": {"$ref": "#/definitions/sdc_recursive_integer_array"}, "type": ["null", "integer", "array"]}, "sdc_recursive_number_array": {"items": {"$ref": "#/definitions/sdc_recursive_number_array"}, "type": ["null", "number", "array"]}, "sdc_recursive_object_array": {"items": {"$ref": "#/definitions/sdc_recursive_object_array"}, "type": ["null", "object", "array"]}, "sdc_recursive_string_array": {"items": {"$ref": "#/definitions/sdc_recursive_string_array"}, "type": ["null", "string", "array"]}, "sdc_recursive_timestamp_array": {"format": "date-time", "items": {"$ref": "#/definitions/sdc_recursive_timestamp_array"}, "type": ["null", "string", "array"]}}, "properties": {"id": {"maximum": 9223372036854775807, "minimum": -9223372036854775808, "type": ["integer"]}, "results": {"type": ["null", "string"]}, "time_created": {"type": ["null", "string"]}}, "type": "object"}, "key_properties": ["id"], "bookmark_properties": ["id"]}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 52799009, "version": 1, "last_replication_method": "INCREMENTAL"}}, "currently_syncing": "tap_mysql_test-test_simple_table"}}
+{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_simple_table", "version": 1}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 1, "results": "xyz1", "time_created": "not-formatted-time-1"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 2, "results": "xyz2", "time_created": "not-formatted-time-2"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 3, "results": "xyz3", "time_created": "not-formatted-time-3"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 4, "results": "xyz4", "time_created": "not-formatted-time-4"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
+{"type": "RECORD", "stream": "tap_mysql_test-test_simple_table", "record": {"id": 5, "results": "xyz5", "time_created": "not-formatted-time-5"}, "version": 1, "time_extracted": "2019-12-17T19:12:12.006049Z"}
diff --git a/tests/unit/resources/streams_only_state.json b/tests/unit/resources/streams_only_state.json
new file mode 100644
index 00000000..ba4b6f80
--- /dev/null
+++ b/tests/unit/resources/streams_only_state.json
@@ -0,0 +1,11 @@
+{"type": "SCHEMA", "stream": "tap_mysql_test-test_simple_table", "schema": {"properties": {"id": {"maximum": 9223372036854775807, "minimum": -9223372036854775808, "type": ["integer"]}, "results": {"type": ["null", "string"]}, "time_created": {"type": ["null", "string"]}}, "type": "date-time"}, "key_properties": ["id"], "bookmark_properties": ["id"]}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 1, "version": 1}}}}
+{"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_simple_table", "version": 1}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 2, "version": 1}}}}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 3, "version": 1}}}}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 5, "version": 1}}}}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 6, "version": 1}}}}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 7, "version": 1}}}}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 10, "version": 1}}}}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 45, "version": 1}}}}
+{"type": "STATE", "value": {"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", "replication_key_value": 100, "version": 1}}}}
diff --git a/tests/unit/test_db_sync.py b/tests/unit/test_db_sync.py
index 15c112ac..e63c86a2 100644
--- a/tests/unit/test_db_sync.py
+++ b/tests/unit/test_db_sync.py
@@ -1,11 +1,10 @@
-import unittest
import json
+import unittest
-from unittest.mock import patch
+from unittest.mock import patch, call
from target_snowflake import db_sync
-from target_snowflake.file_format import FileFormatTypes
-from target_snowflake.exceptions import InvalidFileFormatException, FileFormatNotFoundException
+from target_snowflake.exceptions import PrimaryKeyNotFoundException
class TestDBSync(unittest.TestCase):
@@ -21,6 +20,8 @@ def setUp(self):
'str_or_null': {"type": ["string", "null"]},
'dt': {"type": ["string"], "format": "date-time"},
'dt_or_null': {"type": ["string", "null"], "format": "date-time"},
+ 'd': {"type": ["string"], "format": "date"},
+ 'd_or_null': {"type": ["string", "null"], "format": "date"},
'time': {"type": ["string"], "format": "time"},
'time_or_null': {"type": ["string", "null"], "format": "time"},
'binary': {"type": ["string", "null"], "format": "binary"},
@@ -86,6 +87,11 @@ def test_config_validation(self):
config_with_external_stage['stage'] = 'dummy-value'
self.assertGreater(len(validator(config_with_external_stage)), 0)
+ # Configuration with archive_load_files but no s3_bucket
+ config_with_archive_load_files = minimal_config.copy()
+ config_with_archive_load_files['archive_load_files'] = True
+ self.assertGreater(len(validator(config_with_archive_load_files)), 0)
+
def test_column_type_mapping(self):
"""Test JSON type to Snowflake column type mappings"""
mapper = db_sync.column_type
@@ -96,6 +102,8 @@ def test_column_type_mapping(self):
'str_or_null': 'text',
'dt': 'timestamp_ntz',
'dt_or_null': 'timestamp_ntz',
+ 'd': 'date',
+ 'd_or_null': 'date',
'time': 'time',
'time_or_null': 'time',
'binary': 'binary',
@@ -121,6 +129,8 @@ def test_column_trans(self):
'str_or_null': '',
'dt': '',
'dt_or_null': '',
+ 'd': '',
+ 'd_or_null': '',
'time': '',
'time_or_null': '',
'binary': 'to_binary',
@@ -140,13 +150,14 @@ def test_create_query_tag(self):
self.assertIsNone(db_sync.create_query_tag(None))
self.assertEqual(db_sync.create_query_tag('This is a test query tag'), 'This is a test query tag')
self.assertEqual(db_sync.create_query_tag('Loading into {{database}}.{{schema}}.{{table}}',
- database='test_database',
- schema='test_schema',
- table='test_table'), 'Loading into test_database.test_schema.test_table')
+ database='test_database',
+ schema='test_schema',
+ table='test_table'),
+ 'Loading into test_database.test_schema.test_table')
self.assertEqual(db_sync.create_query_tag('Loading into {{database}}.{{schema}}.{{table}}',
- database=None,
- schema=None,
- table=None), 'Loading into ..')
+ database=None,
+ schema=None,
+ table=None), 'Loading into ..')
# JSON formatted query tags with variables
json_query_tag = db_sync.create_query_tag(
@@ -190,7 +201,7 @@ def test_create_query_tag(self):
@patch('target_snowflake.db_sync.DbSync.query')
def test_parallelism(self, query_patch):
- query_patch.return_value = [{ 'type': 'CSV' }]
+ query_patch.return_value = [{'type': 'CSV'}]
minimal_config = {
'account': "dummy-value",
@@ -212,12 +223,49 @@ def test_parallelism(self, query_patch):
self.assertEqual(db_sync.DbSync({**minimal_config,
**external_stage_with_parallel}).connection_config['parallelism'], 5)
- # Using snowflake table stages should enforce single thread parallelism
+ # Using table stages should allow parallelism
table_stage_with_parallel = {
'parallelism': 5
}
self.assertEqual(db_sync.DbSync({**minimal_config,
- **table_stage_with_parallel}).connection_config['parallelism'], 1)
+ **table_stage_with_parallel}).connection_config['parallelism'], 5)
+
+ @patch('target_snowflake.upload_clients.s3_upload_client.S3UploadClient.copy_object')
+ @patch('target_snowflake.db_sync.DbSync.query')
+ def test_copy_to_archive(self, query_patch, copy_object_patch):
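+ """Test that copy_to_archive defaults to the source S3 bucket with an 'archive' prefix and honours custom archive bucket and prefix settings"""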
+ query_patch.return_value = [{'type': 'CSV'}]
+ minimal_config = {
+ 'account': "dummy-value",
+ 'dbname': "dummy-value",
+ 'user': "dummy-value",
+ 'password': "dummy-value",
+ 'warehouse': "dummy-value",
+ 'default_target_schema': "dummy-value",
+ 'file_format': "dummy-value",
+ 's3_bucket': 'dummy-bucket',
+ 'stage': 'dummy_schema.dummy_stage'
+ }
+
+ # Assert default values (same bucket, 'archive' as the archive prefix)
+ s3_config = {}
+ dbsync = db_sync.DbSync({**minimal_config, **s3_config})
+ dbsync.copy_to_archive('source/file', 'tap/schema/file', {'meta': "data"})
+
+ self.assertEqual(copy_object_patch.call_args[0][0], 'dummy-bucket/source/file')
+ self.assertEqual(copy_object_patch.call_args[0][1], 'dummy-bucket')
+ self.assertEqual(copy_object_patch.call_args[0][2], 'archive/tap/schema/file')
+
+ # Assert custom archive bucket and prefix
+ s3_config = {
+ 'archive_load_files_s3_bucket': "custom-bucket",
+ 'archive_load_files_s3_prefix': "custom-prefix"
+ }
+ dbsync = db_sync.DbSync({**minimal_config, **s3_config})
+ dbsync.copy_to_archive('source/file', 'tap/schema/file', {'meta': "data"})
+
+ self.assertEqual(copy_object_patch.call_args[0][0], 'dummy-bucket/source/file')
+ self.assertEqual(copy_object_patch.call_args[0][1], 'custom-bucket')
+ self.assertEqual(copy_object_patch.call_args[0][2], 'custom-prefix/tap/schema/file')
def test_safe_column_name(self):
self.assertEqual(db_sync.safe_column_name("columnname"), '"COLUMNNAME"')
@@ -225,8 +273,352 @@ def test_safe_column_name(self):
self.assertEqual(db_sync.safe_column_name("column-name"), '"COLUMN-NAME"')
self.assertEqual(db_sync.safe_column_name("column name"), '"COLUMN NAME"')
- def json_element_name(self):
- self.assertEqual(db_sync.safe_column_name("columnname"), 'columnname"')
- self.assertEqual(db_sync.safe_column_name("columnName"), 'columnName"')
- self.assertEqual(db_sync.safe_column_name("column-name"), 'column-name')
- self.assertEqual(db_sync.safe_column_name('"column name"'), '"column name"')
+ @patch('target_snowflake.db_sync.DbSync.query')
+ def test_record_primary_key_string(self, query_patch):
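+ """Test building primary key strings from records: single and composite keys, missing or null keys raising PrimaryKeyNotFoundException, and falsy key values being accepted"""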
+ query_patch.return_value = [{'type': 'CSV'}]
+ minimal_config = {
+ 'account': "dummy-value",
+ 'dbname': "dummy-value",
+ 'user': "dummy-value",
+ 'password': "dummy-value",
+ 'warehouse': "dummy-value",
+ 'default_target_schema': "dummy-value",
+ 'file_format': "dummy-value"
+ }
+
+ stream_schema_message = {"stream": "public-table1",
+ "schema": {
+ "properties": {
+ "id": {"type": ["integer"]},
+ "c_str": {"type": ["null", "string"]},
+ "c_bool": {"type": ["boolean"]}
+ }},
+ "key_properties": ["id"]}
+
+ # Single primary key string
+ dbsync = db_sync.DbSync(minimal_config, stream_schema_message)
+ self.assertEqual(dbsync.record_primary_key_string({'id': 123}), '123')
+
+ # Composite primary key string
+ stream_schema_message['key_properties'] = ['id', 'c_str']
+ dbsync = db_sync.DbSync(minimal_config, stream_schema_message)
+ self.assertEqual(dbsync.record_primary_key_string({'id': 123, 'c_str': 'xyz'}), '123,xyz')
+
+ # Missing field as PK
+ stream_schema_message['key_properties'] = ['invalid_col']
+ dbsync = db_sync.DbSync(minimal_config, stream_schema_message)
+ with self.assertRaisesRegex(PrimaryKeyNotFoundException,
+ r"Primary key 'invalid_col' does not exist in record or is null\. Available "
+ r"fields: \['id', 'c_str'\]"):
+ dbsync.record_primary_key_string({'id': 123, 'c_str': 'xyz'})
+
+ # Null PK field
+ stream_schema_message['key_properties'] = ['id']
+ dbsync = db_sync.DbSync(minimal_config, stream_schema_message)
+ with self.assertRaisesRegex(PrimaryKeyNotFoundException,
+ r"Primary key 'id' does not exist in record or is null\. Available "
+ r"fields: \['id', 'c_str'\]"):
+ dbsync.record_primary_key_string({'id': None, 'c_str': 'xyz'})
+
+ # falsy PK value (0) should still be accepted
+ stream_schema_message['key_properties'] = ['id']
+ dbsync = db_sync.DbSync(minimal_config, stream_schema_message)
+ self.assertEqual(dbsync.record_primary_key_string({'id': 0, 'c_str': 'xyz'}), '0')
+
+ # falsy PK value (False) in a composite key should still be accepted
+ stream_schema_message['key_properties'] = ['id', 'c_bool']
+ dbsync = db_sync.DbSync(minimal_config, stream_schema_message)
+ self.assertEqual(dbsync.record_primary_key_string({'id': 1, 'c_bool': False, 'c_str': 'xyz'}), '1,False')
+
+ @patch('target_snowflake.db_sync.DbSync.query')
+ @patch('target_snowflake.db_sync.DbSync._load_file_merge')
+ def test_merge_failure_message(self, load_file_merge_patch, query_patch):
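+ """Test that a failing MERGE during load_file logs a descriptive error message"""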
+ LOGGER_NAME = "target_snowflake"
+ query_patch.return_value = [{'type': 'CSV'}]
+ minimal_config = {
+ 'account': "dummy_account",
+ 'dbname': "dummy_dbname",
+ 'user': "dummy_user",
+ 'password': "dummy_password",
+ 'warehouse': "dummy_warehouse",
+ 'default_target_schema': "dummy_default_target_schema",
+ 'file_format': "dummy_file_format",
+ }
+
+ stream_schema_message = {
+ "stream": "dummy_stream",
+ "schema": {
+ "properties": {
+ "id": {"type": ["integer"]},
+ "c_str": {"type": ["null", "string"]}
+ }
+ },
+ "key_properties": ["id"]
+ }
+
+ # A MERGE failure should log an error naming the table and stream
+ dbsync = db_sync.DbSync(minimal_config, stream_schema_message)
+ load_file_merge_patch.side_effect = Exception()
+ expected_msg = (
+ f'ERROR:{LOGGER_NAME}:Error while executing MERGE query '
+ f'for table "{minimal_config["default_target_schema"]}."{stream_schema_message["stream"].upper()}"" '
+ f'in stream "{stream_schema_message["stream"]}"'
+ )
+ with self.assertRaises(Exception), self.assertLogs(logger=LOGGER_NAME, level="ERROR") as captured_logs:
+ dbsync.load_file(s3_key="dummy-key", count=256, size_bytes=256)
+ self.assertIn(expected_msg, captured_logs.output)
+
+ @patch('target_snowflake.db_sync.DbSync.query')
+ @patch('target_snowflake.db_sync.DbSync._load_file_copy')
+ def test_copy_failure_message(self, load_file_copy_patch, query_patch):
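+ """Test that a failing COPY during load_file logs a descriptive error message"""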
+ LOGGER_NAME = "target_snowflake"
+ query_patch.return_value = [{'type': 'CSV'}]
+ minimal_config = {
+ 'account': "dummy_account",
+ 'dbname': "dummy_dbname",
+ 'user': "dummy_user",
+ 'password': "dummy_password",
+ 'warehouse': "dummy_warehouse",
+ 'default_target_schema': "dummy_default_target_schema",
+ 'file_format': "dummy_file_format",
+ }
+
+ stream_schema_message = {
+ "stream": "dummy_stream",
+ "schema": {
+ "properties": {
+ "id": {"type": ["integer"]},
+ "c_str": {"type": ["null", "string"]}
+ }
+ },
+ "key_properties": []
+ }
+
+ # A COPY failure should log an error naming the table and stream
+ dbsync = db_sync.DbSync(minimal_config, stream_schema_message)
+ load_file_copy_patch.side_effect = Exception()
+ expected_msg = (
+ f'ERROR:{LOGGER_NAME}:Error while executing COPY query '
+ f'for table "{minimal_config["default_target_schema"]}."{stream_schema_message["stream"].upper()}"" '
+ f'in stream "{stream_schema_message["stream"]}"'
+ )
+ with self.assertRaises(Exception), self.assertLogs(logger=LOGGER_NAME, level="ERROR") as captured_logs:
+ dbsync.load_file(s3_key="dummy-key", count=256, size_bytes=256)
+ self.assertIn(expected_msg, captured_logs.output)
+
+ @patch('target_snowflake.db_sync.DbSync.query')
+ def test_sync_table_with_no_changes_to_pk(self, query_patch):
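+ """Test that sync_table leaves the primary key untouched when the stream PK matches the existing table PK"""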
+ minimal_config = {
+ 'account': "dummy-account",
+ 'dbname': "dummy-db",
+ 'user': "dummy-user",
+ 'password': "dummy-passwd",
+ 'warehouse': "dummy-wh",
+ 'default_target_schema': "dummy-schema",
+ 'file_format': "dummy-file-format"
+ }
+
+ stream_schema_message = {"stream": "public-table1",
+ "schema": {
+ "properties": {
+ "id": {"type": ["integer"]},
+ "c_str": {"type": ["null", "string"]}}},
+ "key_properties": ["id"]}
+
+ table_cache = [
+ {
+ 'SCHEMA_NAME': 'DUMMY-SCHEMA',
+ 'TABLE_NAME': 'TABLE1',
+ 'COLUMN_NAME': 'ID',
+ 'DATA_TYPE': 'NUMBER'
+ },
+ {
+ 'SCHEMA_NAME': 'DUMMY-SCHEMA',
+ 'TABLE_NAME': 'TABLE1',
+ 'COLUMN_NAME': 'C_STR',
+ 'DATA_TYPE': 'TEXT'
+ }
+ ]
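+ # query() responses, in call order: file format lookup, existing primary key columns, the ALTER statement batch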
+ query_patch.side_effect = [
+ [{'type': 'CSV'}],
+ [{'column_name': 'ID'}],
+ None
+ ]
+
+ dbsync = db_sync.DbSync(minimal_config, stream_schema_message, table_cache)
+ dbsync.sync_table()
+
+ query_patch.assert_has_calls([
+ call('SHOW FILE FORMATS LIKE \'dummy-file-format\''),
+ call('show primary keys in table dummy-db.dummy-schema."TABLE1";'),
+ call(['alter table dummy-schema."TABLE1" alter column "ID" drop not null;'])
+ ])
+
+ @patch('target_snowflake.db_sync.DbSync.query')
+ def test_sync_table_with_new_pk_in_stream(self, query_patch):
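+ """Test that sync_table drops and recreates the primary key when the stream defines a new composite key"""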
+ minimal_config = {
+ 'account': "dummy-account",
+ 'dbname': "dummy-db",
+ 'user': "dummy-user",
+ 'password': "dummy-passwd",
+ 'warehouse': "dummy-wh",
+ 'default_target_schema': "dummy-schema",
+ 'file_format': "dummy-file-format"
+ }
+
+ stream_schema_message = {"stream": "public-table1",
+ "schema": {
+ "properties": {
+ "id": {"type": ["integer"]},
+ "c_str": {"type": ["null", "string"]},
+ "name": {"type": ["string"]},
+ }
+ },
+ "key_properties": ["id", "name"]}
+
+ table_cache = [
+ {
+ 'SCHEMA_NAME': 'DUMMY-SCHEMA',
+ 'TABLE_NAME': 'TABLE1',
+ 'COLUMN_NAME': 'ID',
+ 'DATA_TYPE': 'NUMBER'
+ },
+ {
+ 'SCHEMA_NAME': 'DUMMY-SCHEMA',
+ 'TABLE_NAME': 'TABLE1',
+ 'COLUMN_NAME': 'C_STR',
+ 'DATA_TYPE': 'TEXT'
+ },
+ {
+ 'SCHEMA_NAME': 'DUMMY-SCHEMA',
+ 'TABLE_NAME': 'TABLE1',
+ 'COLUMN_NAME': 'NAME',
+ 'DATA_TYPE': 'TEXT'
+ }
+ ]
+ query_patch.side_effect = [
+ [{'type': 'CSV'}],
+ [{'column_name': 'ID'}],
+ None
+ ]
+
+ dbsync = db_sync.DbSync(minimal_config, stream_schema_message, table_cache)
+ dbsync.sync_table()
+
+ # Because the code uses sets, the column order in the generated queries is not guaranteed,
+ # so the assertions are broken up to account for this.
+ calls = query_patch.call_args_list
+ self.assertEqual(3, len(calls))
+
+ self.assertEqual('SHOW FILE FORMATS LIKE \'dummy-file-format\'', calls[0][0][0])
+ self.assertEqual('show primary keys in table dummy-db.dummy-schema."TABLE1";', calls[1][0][0])
+
+ self.assertEqual('alter table dummy-schema."TABLE1" drop primary key;', calls[2][0][0][0])
+
+ self.assertIn(calls[2][0][0][1], {'alter table dummy-schema."TABLE1" add primary key("ID", "NAME");',
+ 'alter table dummy-schema."TABLE1" add primary key("NAME", "ID");'})
+
+ self.assertListEqual(sorted(calls[2][0][0][2:]),
+ [
+ 'alter table dummy-schema."TABLE1" alter column "ID" drop not null;',
+ 'alter table dummy-schema."TABLE1" alter column "NAME" drop not null;',
+ ]
+ )
+
+ @patch('target_snowflake.db_sync.DbSync.query')
+ def test_sync_table_with_stream_that_changes_to_have_no_pk(self, query_patch):
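+ """Test that sync_table drops the existing primary key when the stream no longer defines one"""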
+ minimal_config = {
+ 'account': "dummy-account",
+ 'dbname': "dummy-db",
+ 'user': "dummy-user",
+ 'password': "dummy-passwd",
+ 'warehouse': "dummy-wh",
+ 'default_target_schema': "dummy-schema",
+ 'file_format': "dummy-file-format"
+ }
+
+ stream_schema_message = {"stream": "public-table1",
+ "schema": {
+ "properties": {
+ "id": {"type": ["integer"]},
+ "c_str": {"type": ["null", "string"]}}},
+ "key_properties": []}
+
+ table_cache = [
+ {
+ 'SCHEMA_NAME': 'DUMMY-SCHEMA',
+ 'TABLE_NAME': 'TABLE1',
+ 'COLUMN_NAME': 'ID',
+ 'DATA_TYPE': 'NUMBER'
+ },
+ {
+ 'SCHEMA_NAME': 'DUMMY-SCHEMA',
+ 'TABLE_NAME': 'TABLE1',
+ 'COLUMN_NAME': 'C_STR',
+ 'DATA_TYPE': 'TEXT'
+ }
+ ]
+ query_patch.side_effect = [
+ [{'type': 'CSV'}],
+ [{'column_name': 'ID'}],
+ None
+ ]
+
+ dbsync = db_sync.DbSync(minimal_config, stream_schema_message, table_cache)
+ dbsync.sync_table()
+
+ query_patch.assert_has_calls([
+ call('SHOW FILE FORMATS LIKE \'dummy-file-format\''),
+ call('show primary keys in table dummy-db.dummy-schema."TABLE1";'),
+ call(['alter table dummy-schema."TABLE1" drop primary key;',
+ 'alter table dummy-schema."TABLE1" alter column "ID" drop not null;'])
+ ])
+
+ @patch('target_snowflake.db_sync.DbSync.query')
+ def test_sync_table_with_stream_that_has_no_pk_but_get_a_new_one(self, query_patch):
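+ """Test that sync_table adds a primary key when the table has none but the stream defines one"""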
+ minimal_config = {
+ 'account': "dummy-account",
+ 'dbname': "dummy-db",
+ 'user': "dummy-user",
+ 'password': "dummy-passwd",
+ 'warehouse': "dummy-wh",
+ 'default_target_schema': "dummy-schema",
+ 'file_format': "dummy-file-format"
+ }
+
+ stream_schema_message = {"stream": "public-table1",
+ "schema": {
+ "properties": {
+ "id": {"type": ["integer"]},
+ "c_str": {"type": ["null", "string"]}}},
+ "key_properties": ['id']}
+
+ table_cache = [
+ {
+ 'SCHEMA_NAME': 'DUMMY-SCHEMA',
+ 'TABLE_NAME': 'TABLE1',
+ 'COLUMN_NAME': 'ID',
+ 'DATA_TYPE': 'NUMBER'
+ },
+ {
+ 'SCHEMA_NAME': 'DUMMY-SCHEMA',
+ 'TABLE_NAME': 'TABLE1',
+ 'COLUMN_NAME': 'C_STR',
+ 'DATA_TYPE': 'TEXT'
+ }
+ ]
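+ # An empty result for 'show primary keys' simulates a table with no existing primary key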
+ query_patch.side_effect = [
+ [{'type': 'CSV'}],
+ [],
+ None
+ ]
+
+ dbsync = db_sync.DbSync(minimal_config, stream_schema_message, table_cache)
+ dbsync.sync_table()
+
+ query_patch.assert_has_calls([
+ call('SHOW FILE FORMATS LIKE \'dummy-file-format\''),
+ call('show primary keys in table dummy-db.dummy-schema."TABLE1";'),
+ call(['alter table dummy-schema."TABLE1" add primary key("ID");',
+ 'alter table dummy-schema."TABLE1" alter column "ID" drop not null;'])
+ ])
diff --git a/tests/unit/test_file_format.py b/tests/unit/test_file_format.py
index 39c41dfa..01669b4f 100644
--- a/tests/unit/test_file_format.py
+++ b/tests/unit/test_file_format.py
@@ -1,12 +1,9 @@
-import datetime
import unittest
-
from unittest.mock import patch
-import target_snowflake.db_sync as db_sync
+from target_snowflake.exceptions import InvalidFileFormatException, FileFormatNotFoundException
from target_snowflake.file_format import FileFormat, FileFormatTypes
from target_snowflake.file_formats import csv, parquet
-from target_snowflake.exceptions import InvalidFileFormatException, FileFormatNotFoundException
class TestFileFormat(unittest.TestCase):
diff --git a/tests/unit/test_stream_utils.py b/tests/unit/test_stream_utils.py
index ba0a3d81..1bccd29a 100644
--- a/tests/unit/test_stream_utils.py
+++ b/tests/unit/test_stream_utils.py
@@ -4,6 +4,7 @@
import target_snowflake.stream_utils as stream_utils
from target_snowflake.exceptions import UnexpectedValueTypeException
+from target_snowflake.exceptions import UnexpectedMessageTypeException
class TestSchemaUtils(unittest.TestCase):
@@ -207,3 +208,32 @@ def test_stream_name_to_dict(self):
# Snowflake table format (Custom '.' separator)
self.assertEqual(stream_utils.stream_name_to_dict('my_catalog.my_schema.my_table', separator='.'),
{"catalog_name": "my_catalog", "schema_name": "my_schema", "table_name": "my_table"})
+
+ def test_get_incremental_key(self):
+ """Test selecting incremental key column from schema message"""
+
+ # Bookmark properties contains column which is also in schema properties
+ self.assertEqual(stream_utils.get_incremental_key(
+ {
+ "type": "SCHEMA",
+ "schema": {"properties": {"id": {}, "some_col": {}}},
+ "key_properties": ["id"],
+ "bookmark_properties": ["some_col"]
+ }), "some_col")
+
+ # Bookmark properties contains column which is not in schema properties
+ self.assertEqual(stream_utils.get_incremental_key(
+ {
+ "type": "SCHEMA",
+ "schema": {"properties": {"id": {}, "some_col": {}}},
+ "key_properties": ["id"],
+ "bookmark_properties": ["lsn"]
+ }), None)
+
+ with self.assertRaises(UnexpectedMessageTypeException):
+ stream_utils.get_incremental_key(
+ {
+ "type": "RECORD",
+ "stream": "some-stream",
+ "record": {}
+ })
diff --git a/tests/unit/test_target_snowflake.py b/tests/unit/test_target_snowflake.py
index 882d28bc..d3eb7637 100644
--- a/tests/unit/test_target_snowflake.py
+++ b/tests/unit/test_target_snowflake.py
@@ -1,7 +1,10 @@
+import io
+import json
import unittest
import os
import itertools
+from contextlib import redirect_stdout
from datetime import datetime, timedelta
from unittest.mock import patch
@@ -16,6 +19,7 @@ class TestTargetSnowflake(unittest.TestCase):
def setUp(self):
self.config = {}
+ self.maxDiff = None
@patch('target_snowflake.flush_streams')
@patch('target_snowflake.DbSync')
@@ -85,4 +89,88 @@ def test_persist_40_records_with_batch_wait_limit(self, dbSync_mock, flush_strea
target_snowflake.persist_lines(self.config, lines)
# Expecting flush after every records + 1 at the end
- assert flush_streams_mock.call_count == 41
\ No newline at end of file
+ self.assertEqual(flush_streams_mock.call_count, 41)
+
+ @patch('target_snowflake.DbSync')
+ @patch('target_snowflake.os.remove')
+ def test_archive_load_files_incremental_replication(self, os_remove_mock, dbSync_mock):
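+ """Test that archive_load_files copies the staged file with incremental-key metadata when using incremental replication"""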
+ self.config['tap_id'] = 'test_tap_id'
+ self.config['archive_load_files'] = True
+ self.config['s3_bucket'] = 'dummy_bucket'
+
+ with open(f'{os.path.dirname(__file__)}/resources/messages-simple-table.json', 'r') as f:
+ lines = f.readlines()
+
+ instance = dbSync_mock.return_value
+ instance.create_schema_if_not_exists.return_value = None
+ instance.sync_table.return_value = None
+ instance.put_to_stage.return_value = 'some-s3-folder/some-name_date_batch_hash.csv.gz'
+
+ target_snowflake.persist_lines(self.config, lines)
+
+ copy_to_archive_args = instance.copy_to_archive.call_args[0]
+ self.assertEqual(copy_to_archive_args[0], 'some-s3-folder/some-name_date_batch_hash.csv.gz')
+ self.assertEqual(copy_to_archive_args[1], 'test_tap_id/test_simple_table/some-name_date_batch_hash.csv.gz')
+ self.assertDictEqual(copy_to_archive_args[2], {
+ 'tap': 'test_tap_id',
+ 'schema': 'tap_mysql_test',
+ 'table': 'test_simple_table',
+ 'archived-by': 'pipelinewise_target_snowflake',
+ 'incremental-key': 'id',
+ 'incremental-key-min': '1',
+ 'incremental-key-max': '5'
+ })
+
+ @patch('target_snowflake.DbSync')
+ @patch('target_snowflake.os.remove')
+ def test_archive_load_files_log_based_replication(self, os_remove_mock, dbSync_mock):
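+ """Test that archive_load_files copies the staged file without incremental-key metadata when using log-based replication"""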
+ self.config['tap_id'] = 'test_tap_id'
+ self.config['archive_load_files'] = True
+
+ with open(f'{os.path.dirname(__file__)}/resources/logical-streams.json', 'r') as f:
+ lines = f.readlines()
+
+ instance = dbSync_mock.return_value
+ instance.create_schema_if_not_exists.return_value = None
+ instance.sync_table.return_value = None
+ instance.put_to_stage.return_value = 'some-s3-folder/some-name_date_batch_hash.csv.gz'
+
+ target_snowflake.persist_lines(self.config, lines)
+
+ copy_to_archive_args = instance.copy_to_archive.call_args[0]
+ self.assertEqual(copy_to_archive_args[0], 'some-s3-folder/some-name_date_batch_hash.csv.gz')
+ self.assertEqual(copy_to_archive_args[1], 'test_tap_id/logical1_table2/some-name_date_batch_hash.csv.gz')
+ self.assertDictEqual(copy_to_archive_args[2], {
+ 'tap': 'test_tap_id',
+ 'schema': 'logical1',
+ 'table': 'logical1_table2',
+ 'archived-by': 'pipelinewise_target_snowflake'
+ })
+
+ @patch('target_snowflake.flush_streams')
+ @patch('target_snowflake.DbSync')
+ def test_persist_lines_with_only_state_messages(self, dbSync_mock, flush_streams_mock):
+ """
+ Given only state messages, target should emit the last one
+ """
+
+ self.config['batch_size_rows'] = 5
+
+ with open(f'{os.path.dirname(__file__)}/resources/streams_only_state.json', 'r') as f:
+ lines = f.readlines()
+
+ instance = dbSync_mock.return_value
+ instance.create_schema_if_not_exists.return_value = None
+ instance.sync_table.return_value = None
+
+ # catch stdout
+ buf = io.StringIO()
+ with redirect_stdout(buf):
+ target_snowflake.persist_lines(self.config, lines)
+
+ flush_streams_mock.assert_not_called()
+
+ self.assertEqual(
+ buf.getvalue().strip(),
+ '{"bookmarks": {"tap_mysql_test-test_simple_table": {"replication_key": "id", '
+ '"replication_key_value": 100, "version": 1}}}')