From 4261bfe47fcf7c8c5a6b7bc880e00745677f3dfc Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Mon, 29 Apr 2024 15:33:31 -0400 Subject: [PATCH 01/19] refactor(linting): fix linting issues Unify the base configuration for Ruff by extending the pyproject.toml from the public repo. Also fixes all linting errors. --- invocations/ci.py | 2 +- invocations/dev.py | 2 +- pyproject.toml | 57 +++++++++--------------- testgen/commands/run_get_entities.py | 2 +- testgen/commands/run_profiling_bridge.py | 2 +- testgen/commands/run_quick_start.py | 2 +- testgen/common/read_file.py | 2 +- testgen/ui/app.py | 3 +- testgen/ui/views/connections.py | 6 +-- testgen/ui/views/profiling_summary.py | 2 +- testgen/ui/views/test_definitions.py | 2 +- 11 files changed, 34 insertions(+), 48 deletions(-) diff --git a/invocations/ci.py b/invocations/ci.py index 0488c3a..184f08b 100644 --- a/invocations/ci.py +++ b/invocations/ci.py @@ -2,7 +2,7 @@ Release and CI/CD tasks belong here. """ -__all__ = ["ci_dotenv", "check_valid_release_type"] +__all__ = ["check_valid_release_type", "ci_dotenv"] import os diff --git a/invocations/dev.py b/invocations/dev.py index 1e877ce..09ee921 100644 --- a/invocations/dev.py +++ b/invocations/dev.py @@ -1,4 +1,4 @@ -__all__ = ["install", "lint", "clean", "build_public_image"] +__all__ = ["build_public_image", "clean", "install", "lint"] from os.path import exists, join from shutil import rmtree, which diff --git a/pyproject.toml b/pyproject.toml index 6f3f767..10c1b94 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,10 +63,8 @@ dependencies = [ [project.optional-dependencies] dev = [ - "black[d]==24.3.0", "invoke==2.2.0", - "isort==5.13.2", - "ruff==0.3.4", + "ruff==0.4.1", "mypy==1.9.0", "pre-commit==3.6.2", "types-PyYAML==6.0.12.20240311", @@ -170,36 +168,27 @@ exclude = [ 'build/', ] -[tool.isort] -profile = "black" -line_length = 120 - -[tool.black] -line-length = 120 -target-version = ['py310'] -include = '\.pyi?$' -exclude = ''' -( - /( - \.eggs # exclude a few common directories in the - | \.git # root of the project - | \.mypy_cache - | \.tox - | \.venv - | env - | venv - | _build - | build - | dist - )/ - | foo.py # also separately exclude a file named foo.py in - # the root of the project -) -''' - [tool.ruff] target-version = "py310" line-length = 120 +indent-width = 4 +include = [ + "invocations/**/*.py", + "testgen/**/*.py", + "tests/**/*.py", +] +exclude = [ + ".eggs", + ".git", + ".mypy_cache", + ".tox", + ".venv", + "env", + "venv", + "_build", + "build", + "dist", +] [tool.ruff.lint] # see: https://beta.ruff.rs/docs/rules. @@ -224,8 +213,8 @@ select = ["A", "F", "S", "I", "T10", "B", "UP", "ISC", "T20", "RSE", "Q", "ARG", # globally ignore the following error codes # * TRY003: Avoid specifying long messages outside the exception class # * S608: Hardcoded SQL -# * I001: Unsorted imports, partially incompatible with isorts suggestions -ignore = ["TRY003", "S608", "I001"] +# # F841: Unused local variable (it is instable) +ignore = ["TRY003", "S608", "S404", "F841"] # Ignore the following errors in files: # F403 - in __init__.py: We use __all__ in our module files so this behavior is acceptable in __init__.py @@ -235,10 +224,8 @@ ignore = ["TRY003", "S608", "I001"] "testgen/__main__.py" = ["ARG001", "S603"] "tasks.py" = ["F403"] "tests*" = ["S101", "T201"] -# print and empty arguments are fine here. 
"invocations/**" = ["ARG001", "T201"] - - +"testgen/common/encrypt.py" = ["S413"] # See: https://coverage.readthedocs.io/en/latest/config.html [tool.coverage.run] diff --git a/testgen/commands/run_get_entities.py b/testgen/commands/run_get_entities.py index c5e558d..19987ea 100644 --- a/testgen/commands/run_get_entities.py +++ b/testgen/commands/run_get_entities.py @@ -31,7 +31,7 @@ def run_list_connections(): def run_get_connection(connection_id): sql_template = read_template_sql_file("get_connection.sql", "get_entities") sql_template = sql_template.replace("{CONNECTION_ID}", str(connection_id)) - rows, header = RetrieveDBResultsToList("DKTG", sql_template) + rows, _ = RetrieveDBResultsToList("DKTG", sql_template) return rows.pop() diff --git a/testgen/commands/run_profiling_bridge.py b/testgen/commands/run_profiling_bridge.py index 172d9c9..a303f31 100644 --- a/testgen/commands/run_profiling_bridge.py +++ b/testgen/commands/run_profiling_bridge.py @@ -311,7 +311,7 @@ def run_profiling_queries(strTableGroupsID, spinner=None): strQuery = clsProfiling.GetTableSampleCount() lstQueries.append(strQuery) - lstSampleTables, lstSampleColumnNames, intErrors = RunThreadedRetrievalQueryList( + lstSampleTables, _, intErrors = RunThreadedRetrievalQueryList( "PROJECT", lstQueries, dctParms["max_threads"], spinner ) dctSampleTables = {x[0]: [x[1], x[2]] for x in lstSampleTables} diff --git a/testgen/commands/run_quick_start.py b/testgen/commands/run_quick_start.py index 1393cc0..f0b28e5 100644 --- a/testgen/commands/run_quick_start.py +++ b/testgen/commands/run_quick_start.py @@ -133,7 +133,7 @@ def run_quick_start(delete_target_db: bool) -> None: # Get table group id project_key = params_mapping["PROJECT_KEY"] - rows, header = run_table_group_list(project_key) + rows, _ = run_table_group_list(project_key) connection_id = str(rows[0][2]) # run qc diff --git a/testgen/common/read_file.py b/testgen/common/read_file.py index 6cdd22c..034e9ca 100644 --- a/testgen/common/read_file.py +++ b/testgen/common/read_file.py @@ -1,4 +1,4 @@ -__all__ = ["read_template_sql_file", "read_template_yaml_file", "get_template_files"] +__all__ = ["get_template_files", "read_template_sql_file", "read_template_yaml_file"] import logging import re diff --git a/testgen/ui/app.py b/testgen/ui/app.py index d72a2a0..e1cbd55 100644 --- a/testgen/ui/app.py +++ b/testgen/ui/app.py @@ -8,9 +8,8 @@ from testgen.ui import bootstrap from testgen.ui.components import widgets as testgen from testgen.ui.queries import project_queries -from testgen.ui.services import authentication_service +from testgen.ui.services import authentication_service, javascript_service from testgen.ui.services import database_service as db -from testgen.ui.services import javascript_service from testgen.ui.session import session logger = logging.getLogger("testgen.ui") diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index 0ed8f94..af868f2 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -85,7 +85,7 @@ def show_create_qc_schema_modal(modal, selected_connection): if submit: empty_cache() - bottom_left_column, bottom_right_column = st.columns([0.20, 0.80]) + _, bottom_right_column = st.columns([0.20, 0.80]) operation_status = bottom_right_column.empty() operation_status.empty() @@ -117,9 +117,9 @@ def show_connection_form(connection, project_code): flavor_options = ["redshift", "snowflake", "mssql", "postgresql"] left_column, right_column = st.columns([0.75, 0.25]) - toggle_left_column, 
toggle_right_column = st.columns([0.25, 0.75]) + toggle_left_column, _ = st.columns([0.25, 0.75]) bottom_left_column, bottom_right_column = st.columns([0.25, 0.75]) - button_left_column, button_right_column, button_remaining_column = st.columns([0.20, 0.20, 0.60]) + button_left_column, _, _ = st.columns([0.20, 0.20, 0.60]) connection_id = connection["connection_id"] connection_name = connection["connection_name"] diff --git a/testgen/ui/views/profiling_summary.py b/testgen/ui/views/profiling_summary.py index d2ca3ae..f48b759 100644 --- a/testgen/ui/views/profiling_summary.py +++ b/testgen/ui/views/profiling_summary.py @@ -134,7 +134,7 @@ def open_drill_downs(dct_selected_rows, button_slots): def show_record_detail(dct_selected_row): - layout_column_1, layout_column_2 = st.columns([0.5, 0.5]) + layout_column_1, _ = st.columns([0.5, 0.5]) with layout_column_1: str_header = "Profiling Run Information" diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index f6e1272..2762677 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -59,7 +59,7 @@ def render(self, **_) -> None: # Prompt for Table Group with tool_bar.long_slots[1]: - str_table_groups_id, str_connection_id, str_schema, table_group = prompt_for_table_group( + str_table_groups_id, str_connection_id, _, table_group = prompt_for_table_group( session.project, table_group, str_connection_id ) From e02d8121737c8951dcde932dcfe14d45d52a4752 Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Mon, 29 Apr 2024 17:08:36 -0400 Subject: [PATCH 02/19] refactor(typing): fix mypy reported issues Add some missing params and return types and guard against possible None return values --- .gitattributes | 1 - .pre-commit-config.yaml | 6 +----- invocations/ci.py | 3 ++- invocations/dev.py | 24 +++++++++++++++--------- invocations/toolbox.py | 4 ++-- 5 files changed, 20 insertions(+), 18 deletions(-) delete mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index 51c1c5f..0000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -.gitlab-ci.yml merge=ours diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 75cffc3..b7ffa30 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,12 +7,8 @@ repos: - id: no-commit-to-branch args: [--branch, staging, --branch, main, --branch, production] - id: trailing-whitespace - - repo: https://github.com/psf/black - rev: 24.3.0 - hooks: - - id: black - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: 'v0.3.4' + rev: 'v0.4.1' hooks: - id: ruff verbose: true diff --git a/invocations/ci.py b/invocations/ci.py index 184f08b..b86dd37 100644 --- a/invocations/ci.py +++ b/invocations/ci.py @@ -8,7 +8,8 @@ import os import semver -from invoke import Exit, task +from invoke.exceptions import Exit +from invoke.tasks import task @task diff --git a/invocations/dev.py b/invocations/dev.py index 09ee921..731c19b 100644 --- a/invocations/dev.py +++ b/invocations/dev.py @@ -4,7 +4,9 @@ from shutil import rmtree, which import tomli -from invoke import Exit, task +from invoke.context import Context +from invoke.exceptions import Exit +from invoke.tasks import task from .toolbox import ensure_tools @@ -13,12 +15,12 @@ @task -def required_tools(ctx): +def required_tools(ctx: Context) -> None: ensure_tools("git", "find", "docker") @task -def install(ctx, quiet_pip=False): +def install(ctx: Context, quiet_pip: bool = False) -> None: """Installs the package as a developer 
(editable, all optional dependencies).""" if quiet_pip: print("testgen package is being re-installed.") @@ -26,7 +28,7 @@ def install(ctx, quiet_pip=False): @task -def lint(ctx): +def lint(ctx: Context) -> None: """Runs the standard suite of quality/linting tools.""" ctx.run("isort .") ctx.run("black .") @@ -35,7 +37,7 @@ def lint(ctx): @task -def precommit(ctx, all_files=False): +def precommit(ctx: Context, all_files: bool = False) -> None: """Runs pre-commit.""" if which("pre-commit") is None: install(ctx) @@ -47,10 +49,13 @@ def precommit(ctx, all_files=False): @task(pre=(required_tools,)) -def clean(ctx): +def clean(ctx: Context) -> None: """Deletes old python files and build artifacts""" - repo_root = ctx.run("git rev-parse --show-toplevel", hide=True).stdout.strip() + result = ctx.run("git rev-parse --show-toplevel", hide=True) + if not result: + raise Exit("Failure running git rev-parse") + repo_root = result.stdout.strip() with open(join(repo_root, "pyproject.toml"), "rb") as f: project_name: str = tomli.load(f)["project"]["name"] @@ -62,13 +67,14 @@ def clean(ctx): @task(pre=(required_tools,)) -def build_public_image(ctx, version: str, push=False, local=False): +def build_public_image(ctx: Context, version: str, push: bool = False, local: bool = False) -> None: """Builds and pushes the TestGen image""" use_cmd = f"docker buildx use {DOCKER_BUILDER_NAME}" if push and local: raise Exit("Cannot use --local and --push at the same time.") - if not ctx.run(use_cmd, hide=True, warn=True).ok: + + if (result := ctx.run(use_cmd, hide=True, warn=True)) and not result.ok: ctx.run(f"docker buildx create --name {DOCKER_BUILDER_NAME} --platform {DOCKER_BUILDER_PLATFORMS}") ctx.run(use_cmd) diff --git a/invocations/toolbox.py b/invocations/toolbox.py index 6a8739d..8d913ab 100644 --- a/invocations/toolbox.py +++ b/invocations/toolbox.py @@ -1,9 +1,9 @@ from shutil import which -from invoke import Exit +from invoke.exceptions import Exit -def ensure_tools(*tools): +def ensure_tools(*tools: str) -> None: """ Check the PATH to see if the required tools exist. e.g., From 071d4190e59318d4b9d50301aa2e32686f6945cf Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Mon, 29 Apr 2024 19:05:19 -0400 Subject: [PATCH 03/19] ci(commands): update configuration --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 10c1b94..10a35ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -162,10 +162,10 @@ show_error_context = true # Mypy appears smart enough to ignore hidden directories. But it needs to scan __pycache__ for .pyc and pyi files, # so it cannot honor gitignore. 
exclude = [ - '''^(?:.*\/)+[tT]ests?''', 'conftest.py', 'venv/', 'build/', + 'tests/', ] [tool.ruff] @@ -244,7 +244,7 @@ skip_empty=true [tool.bumpver] current_version = "2.0.0" version_pattern = "MAJOR.MINOR.PATCH[PYTAGNUM]" -commit_message = "Bump version: {old_version} -> {new_version}" +commit_message = "release: {old_version} -> {new_version}" commit = true tag = true push = false From bfeefde092e57f786595076447e22da53c2bd4da Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Wed, 1 May 2024 18:26:41 -0400 Subject: [PATCH 04/19] feat: implement a cascade delete of entities --- .../050_populate_new_schema_metadata.sql | 4 +- testgen/ui/queries/connection_queries.py | 7 +++ testgen/ui/queries/table_group_queries.py | 61 +++++++++++++++++++ testgen/ui/queries/test_definition_queries.py | 10 +++ testgen/ui/queries/test_run_queries.py | 17 ++++++ testgen/ui/queries/test_suite_queries.py | 22 ++++++- testgen/ui/services/connection_service.py | 31 +++++++++- testgen/ui/services/table_group_service.py | 48 ++++++++++++--- .../ui/services/test_definition_service.py | 7 +++ testgen/ui/services/test_run_service.py | 8 +++ testgen/ui/services/test_suite_service.py | 27 ++++++-- testgen/ui/views/table_groups.py | 31 ++++++---- testgen/ui/views/test_suites.py | 33 ++++++---- 13 files changed, 265 insertions(+), 41 deletions(-) create mode 100644 testgen/ui/queries/test_run_queries.py create mode 100644 testgen/ui/services/test_run_service.py diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql index 302d98b..ad7c453 100644 --- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql +++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql @@ -48,7 +48,7 @@ n controls over data ingested and to make values more efficient, consistent and AND p.column_name NOT ILIKE ''%zip%'' AND p.functional_data_type NOT ILIKE ''id%'' AND p.value_ct > p.numeric_ct - AND p.numeric_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', ''' Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the numeric data in a numeric column. If the alpha data is significant, you could store it in a different column.'), + AND p.numeric_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the numeric data in a numeric column. If the alpha data is significant, you could store it in a different column.'), ('1012', 'Char_Column_Date_Values', 'Column', 'Character Column with Mostly Date Values', 'This column is defined as alpha, but more than 95% of its values are dates. Dates in alpha columns might not sort correctly, and might contradict user expectations downstream. It''s also possible that more than one type of information is stored in the column, making it harder to retrieve. 
', 'p.general_type = ''A'' AND p.value_ct > p.date_ct AND p.date_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', ''' Date Ct: '' || p.date_ct || '' of '' || p.value_ct || '' (Date Percent: '' || ROUND(100.0 * p.date_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)' , 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the date values as a date or datetime column. If the alpha data is also significant, you could store it in a different column.'), @@ -85,7 +85,7 @@ n controls over data ingested and to make values more efficient, consistent and 'Under 3% of values in this column were found to be numeric. This could indicate a data error.', e'p.general_type = \'A\' AND p.numeric_ct::FLOAT / p.record_ct::FLOAT < 0.03 AND p.numeric_ct > 0', - ''' Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', + '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and follow-up with data owners to determine whether numeric values are invalid entries here.'), ('1024', 'Invalid_Zip3_USA', 'Column', 'Invalid USA ZIP-3 Format', 'The majority of values in this column are 3-digit zips, but divergent patterns were found. This could indicate an incorrect roll-up category or a PII concern.', 'p.distinct_pattern_ct > 1 AND (p.column_name ilike ''%zip%'' OR p.column_name ILIKE ''%postal%'') diff --git a/testgen/ui/queries/connection_queries.py b/testgen/ui/queries/connection_queries.py index 0507f35..b7424b9 100644 --- a/testgen/ui/queries/connection_queries.py +++ b/testgen/ui/queries/connection_queries.py @@ -1,3 +1,4 @@ +import pandas as pd import streamlit as st import testgen.ui.services.database_service as db @@ -30,6 +31,12 @@ def get_connections(project_code): return db.retrieve_data(str_sql) +def get_table_group_names_by_connection(schema: str, connection_ids: list[str]) -> pd.DataFrame: + items = [f"'{item}'" for item in connection_ids] + str_sql = f"""select table_groups_name from {schema}.table_groups where connection_id in ({",".join(items)})""" + return db.retrieve_data(str_sql) + + def edit_connection(schema, connection, encrypted_password): sql = f"""UPDATE {schema}.connections SET project_code = '{connection["project_code"]}', diff --git a/testgen/ui/queries/table_group_queries.py b/testgen/ui/queries/table_group_queries.py index 275f8b1..a1ce0d5 100644 --- a/testgen/ui/queries/table_group_queries.py +++ b/testgen/ui/queries/table_group_queries.py @@ -25,6 +25,46 @@ def get_by_id(schema, table_group_id): return db.retrieve_data(sql) +def get_test_suite_names_by_table_group_names(schema, table_group_names): + items = [f"'{item}'" for item in table_group_names] + sql = f"""select test_suite +from {schema}.test_suites ts +inner join {schema}.table_groups tg on tg.id = ts.table_groups_id +where tg.table_groups_name in ({",".join(items)}) + """ + return db.retrieve_data(sql) + + +def get_table_group_dependencies(schema, table_group_names): + if table_group_names is None or len(table_group_names) == 0: + raise ValueError("No Table Group is specified.") + + table_group_items = [f"'{item}'" for item in table_group_names] + sql = f"""select ppr.profile_run_id from {schema}.profile_pair_rules ppr + INNER JOIN {schema}.profiling_runs 
pr ON pr.id = ppr.profile_run_id + INNER JOIN {schema}.table_groups tg ON tg.id = pr.table_groups_id + where tg.table_groups_name in ({",".join(table_group_items)}) + union + select par.table_groups_id from {schema}.profile_anomaly_results par INNER JOIN {schema}.table_groups tg ON tg.id = par.table_groups_id where tg.table_groups_name in ({",".join(table_group_items)}) + union + select pr.table_groups_id from {schema}.profile_results pr INNER JOIN {schema}.table_groups tg ON tg.id = pr.table_groups_id where tg.table_groups_name in ({",".join(table_group_items)}) + union + select pr.table_groups_id from {schema}.profiling_runs pr INNER JOIN {schema}.table_groups tg ON tg.id = pr.table_groups_id where tg.table_groups_name in ({",".join(table_group_items)}) + union + select dtc.table_groups_id from {schema}.data_table_chars dtc INNER JOIN {schema}.table_groups tg ON tg.id = dtc.table_groups_id where tg.table_groups_name in ({",".join(table_group_items)}) + union + select dcs.table_groups_id from {schema}.data_column_chars dcs INNER JOIN {schema}.table_groups tg ON tg.id = dcs.table_groups_id where tg.table_groups_name in ({",".join(table_group_items)});""" + return db.retrieve_data(sql) + + +def get_table_group_usage(schema, table_group_names): + items = [f"'{item}'" for item in table_group_names] + sql = f"""select distinct pr.id from {schema}.profiling_runs pr +INNER JOIN {schema}.table_groups tg ON tg.id = pr.table_groups_id +where tg.table_groups_name in ({",".join(items)}) and pr.status = 'Running'""" + return db.retrieve_data(sql) + + @st.cache_data(show_spinner=False) def get_by_connection(schema, project_code, connection_id): sql = _get_select_statement(schema) @@ -100,3 +140,24 @@ def delete(schema, table_group_ids): sql = f"""DELETE FROM {schema}.table_groups WHERE id in ({",".join(items)})""" db.execute_sql(sql) st.cache_data.clear() + + +def cascade_delete(schema, table_group_names): + if table_group_names is None or len(table_group_names) == 0: + raise ValueError("No Table Group is specified.") + + table_group_items = [f"'{item}'" for item in table_group_names] + sql = f"""delete from {schema}.profile_pair_rules ppr +USING {schema}.profiling_runs pr, {schema}.table_groups tg +WHERE +pr.id = ppr.profile_run_id +AND tg.id = pr.table_groups_id +AND tg.table_groups_name in ({",".join(table_group_items)}); +delete from {schema}.profile_anomaly_results par USING {schema}.table_groups tg where tg.id = par.table_groups_id and tg.table_groups_name in ({",".join(table_group_items)}); +delete from {schema}.profile_results pr USING {schema}.table_groups tg where tg.id = pr.table_groups_id and tg.table_groups_name in ({",".join(table_group_items)}); +delete from {schema}.profiling_runs pr USING {schema}.table_groups tg where tg.id = pr.table_groups_id and tg.table_groups_name in ({",".join(table_group_items)}); +delete from {schema}.data_table_chars dtc USING {schema}.table_groups tg where tg.id = dtc.table_groups_id and tg.table_groups_name in ({",".join(table_group_items)}); +delete from {schema}.data_column_chars dcs USING {schema}.table_groups tg where tg.id = dcs.table_groups_id and tg.table_groups_name in ({",".join(table_group_items)}); +delete from {schema}.table_groups where table_groups_name in ({",".join(table_group_items)});""" + db.execute_sql(sql) + st.cache_data.clear() diff --git a/testgen/ui/queries/test_definition_queries.py b/testgen/ui/queries/test_definition_queries.py index e944df2..80f133b 100644 --- a/testgen/ui/queries/test_definition_queries.py +++ 
b/testgen/ui/queries/test_definition_queries.py @@ -271,3 +271,13 @@ def delete(schema, test_definition_ids): sql = f"""DELETE FROM {schema}.test_definitions WHERE id in ({",".join(items)})""" db.execute_sql(sql) st.cache_data.clear() + + +def cascade_delete(schema, test_suite_names): + if test_suite_names is None or len(test_suite_names) == 0: + raise ValueError("No Test Suite is specified.") + + items = [f"'{item}'" for item in test_suite_names] + sql = f"""delete from {schema}.test_definitions where test_suite in ({",".join(items)})""" + db.execute_sql(sql) + st.cache_data.clear() diff --git a/testgen/ui/queries/test_run_queries.py b/testgen/ui/queries/test_run_queries.py new file mode 100644 index 0000000..d9b4b10 --- /dev/null +++ b/testgen/ui/queries/test_run_queries.py @@ -0,0 +1,17 @@ +import streamlit as st + +import testgen.ui.services.database_service as db + + +def cascade_delete(schema: str, test_suite_names: list[str]) -> None: + if test_suite_names is None or len(test_suite_names) == 0: + raise ValueError("No Test Suite is specified.") + + items = [f"'{item}'" for item in test_suite_names] + sql = f"""delete from {schema}.working_agg_cat_results where test_suite in ({",".join(items)}); +delete from {schema}.working_agg_cat_tests where test_suite in ({",".join(items)}); +delete from {schema}.test_runs where test_suite in ({",".join(items)}); +delete from {schema}.test_results where test_suite in ({",".join(items)}); +delete from {schema}.execution_queue where test_suite in ({",".join(items)});""" + db.execute_sql(sql) + st.cache_data.clear() diff --git a/testgen/ui/queries/test_suite_queries.py b/testgen/ui/queries/test_suite_queries.py index 5c251db..7ea1c7b 100644 --- a/testgen/ui/queries/test_suite_queries.py +++ b/testgen/ui/queries/test_suite_queries.py @@ -1,3 +1,4 @@ +import pandas as pd import streamlit as st import testgen.ui.services.database_service as db @@ -76,7 +77,17 @@ def delete(schema, test_suite_ids): st.cache_data.clear() -def get_test_suite_usage(schema, test_suite_names): +def cascade_delete(schema: str, test_suite_names: list[str]) -> None: + if test_suite_names is None or len(test_suite_names) == 0: + raise ValueError("No Test Suite is specified.") + + items = [f"'{item}'" for item in test_suite_names] + sql = f"""delete from {schema}.test_suites where test_suite in ({",".join(items)})""" + db.execute_sql(sql) + st.cache_data.clear() + + +def get_test_suite_dependencies(schema: str, test_suite_names: list[str]) -> pd.DataFrame: test_suite_names_join = [f"'{item}'" for item in test_suite_names] sql = f""" select distinct test_suite from {schema}.test_definitions where test_suite in ({",".join(test_suite_names_join)}) @@ -86,3 +97,12 @@ def get_test_suite_usage(schema, test_suite_names): select distinct test_suite from {schema}.test_results where test_suite in ({",".join(test_suite_names_join)}); """ return db.retrieve_data(sql) + + + +def get_test_suite_usage(schema: str, test_suite_names: list[str]) -> pd.DataFrame: + test_suite_names_join = [f"'{item}'" for item in test_suite_names] + sql = f""" + select distinct test_suite from {schema}.test_runs where test_suite in ({",".join(test_suite_names_join)}) and status = 'Running' + """ + return db.retrieve_data(sql) diff --git a/testgen/ui/services/connection_service.py b/testgen/ui/services/connection_service.py index a1d2af8..0ada931 100644 --- a/testgen/ui/services/connection_service.py +++ b/testgen/ui/services/connection_service.py @@ -1,8 +1,9 @@ import streamlit as st -import 
testgen.ui.queries.connection_queries as connection_queries from testgen.commands.run_profiling_bridge import InitializeProfilingSQL from testgen.commands.run_setup_profiling_tools import run_setup_profiling_tools +import testgen.ui.queries.connection_queries as connection_queries +import testgen.ui.services.table_group_service as table_group_service from testgen.common.database.database_service import ( AssignConnectParms, RetrieveDBResultsToList, @@ -60,6 +61,34 @@ def delete_connections(connection_ids): return connection_queries.delete_connections(schema, connection_ids) +def cascade_delete(connection_ids, dry_run=False): + schema = st.session_state["dbschema"] + can_be_deleted = True + table_group_names = get_table_group_names_by_connection(connection_ids) + connection_has_dependencies = table_group_names is not None and len(table_group_names) > 0 + if connection_has_dependencies: + can_be_deleted = False + if not dry_run: + if connection_has_dependencies: + table_group_service.cascade_delete(table_group_names) + connection_queries.delete_connections(schema, connection_ids) + return can_be_deleted + + +def are_connections_in_use(connection_ids): + table_group_names = get_table_group_names_by_connection(connection_ids) + table_groups_in_use = table_group_service.are_table_groups_in_use(table_group_names) + return table_groups_in_use + + +def get_table_group_names_by_connection(connection_ids): + if not connection_ids: + return [] + schema = st.session_state["dbschema"] + table_group_names = connection_queries.get_table_group_names_by_connection(schema, connection_ids) + return table_group_names.to_dict()["table_groups_name"].values() + + def init_profiling_sql(project_code, connection, table_group_schema=None): # get connection data empty_cache() diff --git a/testgen/ui/services/table_group_service.py b/testgen/ui/services/table_group_service.py index b660e7c..3a1125e 100644 --- a/testgen/ui/services/table_group_service.py +++ b/testgen/ui/services/table_group_service.py @@ -1,8 +1,9 @@ import streamlit as st +from testgen.common.database.database_service import RetrieveDBResultsToDictList import testgen.ui.queries.table_group_queries as table_group_queries import testgen.ui.services.connection_service as connection_service -from testgen.common.database.database_service import RetrieveDBResultsToDictList +import testgen.ui.services.test_suite_service as test_suite_service def get_by_id(table_group_id: str): @@ -25,17 +26,46 @@ def add(table_group): table_group_queries.add(schema, table_group) -def delete(table_group_ids, table_group_names, dry_run=False): # noqa ARG001 +def cascade_delete(table_group_names, dry_run=False): schema = st.session_state["dbschema"] + test_suite_names = get_test_suite_names_by_table_group_names(table_group_names) + can_be_deleted = not table_group_has_dependencies(schema, table_group_names, test_suite_names) + if not dry_run: + test_suite_service.cascade_delete(test_suite_names) + table_group_queries.cascade_delete(schema, table_group_names) + return can_be_deleted - # TODO: avoid deletion of used table groups - # usage_result = table_group_queries.get_table_group_usage(schema, table_group_ids, table_group_names) - # can_be_deleted = usage_result.empty - can_be_deleted = True - if not dry_run and can_be_deleted: - table_group_queries.delete(schema, table_group_ids) - return can_be_deleted +def table_group_has_dependencies(schema, table_group_names, test_suite_names): + test_suite_usage_result = test_suite_service.has_test_suite_dependencies(schema, 
test_suite_names) + if not table_group_names: + table_group_usage_result = False + else: + table_group_usage_result = not table_group_queries.get_table_group_dependencies(schema, table_group_names).empty + return test_suite_usage_result or table_group_usage_result + + +def are_table_groups_in_use(table_group_names): + if not table_group_names: + return False + + schema = st.session_state["dbschema"] + + test_suite_names = get_test_suite_names_by_table_group_names(table_group_names) + test_suites_in_use = test_suite_service.are_test_suites_in_use(test_suite_names) + + table_groups_in_use_result = table_group_queries.get_table_group_usage(schema, table_group_names) + table_groups_in_use = not table_groups_in_use_result.empty + + return test_suites_in_use or table_groups_in_use + + +def get_test_suite_names_by_table_group_names(table_group_names): + if not table_group_names: + return [] + schema = st.session_state["dbschema"] + test_suite_names = table_group_queries.get_test_suite_names_by_table_group_names(schema, table_group_names) + return test_suite_names.to_dict()["test_suite"].values() def test_table_group(table_group, connection_id, project_code): diff --git a/testgen/ui/services/test_definition_service.py b/testgen/ui/services/test_definition_service.py index 4d51b59..aa33196 100644 --- a/testgen/ui/services/test_definition_service.py +++ b/testgen/ui/services/test_definition_service.py @@ -4,6 +4,7 @@ import testgen.ui.services.connection_service as connection_service import testgen.ui.services.database_service as database_service import testgen.ui.services.table_group_service as table_group_service +import testgen.ui.services.test_run_service as test_run_service def update_attribute(test_definition_ids, attribute, value): @@ -30,6 +31,12 @@ def delete(test_definition_ids, dry_run=False): return can_be_deleted +def cascade_delete(test_suite_names): + schema = st.session_state["dbschema"] + test_run_service.cascade_delete(test_suite_names) + test_definition_queries.cascade_delete(schema, test_suite_names) + + def add(test_definition): schema = st.session_state["dbschema"] prepare_to_persist(test_definition) diff --git a/testgen/ui/services/test_run_service.py b/testgen/ui/services/test_run_service.py new file mode 100644 index 0000000..6833668 --- /dev/null +++ b/testgen/ui/services/test_run_service.py @@ -0,0 +1,8 @@ +import streamlit as st + +import testgen.ui.queries.test_run_queries as test_run_queries + + +def cascade_delete(test_suite_names): + schema = st.session_state["dbschema"] + test_run_queries.cascade_delete(schema, test_suite_names) diff --git a/testgen/ui/services/test_suite_service.py b/testgen/ui/services/test_suite_service.py index 20b2899..b7b44a6 100644 --- a/testgen/ui/services/test_suite_service.py +++ b/testgen/ui/services/test_suite_service.py @@ -1,6 +1,7 @@ import streamlit as st import testgen.ui.queries.test_suite_queries as test_suite_queries +import testgen.ui.services.test_definition_service as test_definition_service def get_by_table_group(project_code, table_group_id): @@ -18,10 +19,26 @@ def add(test_suite): test_suite_queries.add(schema, test_suite) -def delete(test_suite_ids, test_suite_names, dry_run=False): +def cascade_delete(test_suite_names, dry_run=False): + if not test_suite_names: + return True schema = st.session_state["dbschema"] - usage_result = test_suite_queries.get_test_suite_usage(schema, test_suite_names) - can_be_deleted = usage_result.empty - if not dry_run and can_be_deleted: - test_suite_queries.delete(schema, test_suite_ids) + 
can_be_deleted = not has_test_suite_dependencies(schema, test_suite_names) + if not dry_run: + test_definition_service.cascade_delete(test_suite_names) + test_suite_queries.cascade_delete(schema, test_suite_names) return can_be_deleted + + +def has_test_suite_dependencies(schema, test_suite_names): + if not test_suite_names: + return False + return not test_suite_queries.get_test_suite_dependencies(schema, test_suite_names).empty + + +def are_test_suites_in_use(test_suite_names): + if not test_suite_names: + return False + schema = st.session_state["dbschema"] + usage_result = test_suite_queries.get_test_suite_usage(schema, test_suite_names) + return not usage_result.empty diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups.py index 74dec47..1d7459f 100644 --- a/testgen/ui/views/table_groups.py +++ b/testgen/ui/views/table_groups.py @@ -228,7 +228,7 @@ def show_delete_modal(modal, selected=None): table_group_id = selected_table_group["id"] table_group_name = selected_table_group["table_groups_name"] - can_be_deleted = table_group_service.delete([table_group_id], [table_group_name], dry_run=True) + can_be_deleted = table_group_service.cascade_delete([table_group_name], dry_run=True) fm.render_html_list( selected_table_group, @@ -241,20 +241,29 @@ def show_delete_modal(modal, selected=None): int_data_width=700, ) + if not can_be_deleted: + st.markdown( + ":orange[This Table Group has related data, which may include profiling, test definitions and test results. If you proceed, all related data will be permanently deleted.
Are you sure you want to proceed?]", + unsafe_allow_html=True, + ) + accept_cascade_delete = st.toggle("I accept deletion of this Table Group and all related TestGen data.") + with st.form("Delete Table Group", clear_on_submit=True): - disable_delete_button = authentication_service.current_user_has_read_role() or not can_be_deleted + disable_delete_button = authentication_service.current_user_has_read_role() or ( + not can_be_deleted and not accept_cascade_delete + ) delete = st.form_submit_button("Delete", disabled=disable_delete_button) if delete: - table_group_service.delete([table_group_id], [table_group_name]) - success_message = f"Table Group {table_group_name} has been deleted. " - st.success(success_message) - time.sleep(1) - modal.close() - st.experimental_rerun() - - if not can_be_deleted: - st.markdown(":orange[This Table Group cannot be deleted because it is being used in existing tests.]") + if table_group_service.are_table_groups_in_use([table_group_name]): + st.error("This Table Group is in use by a running process and cannot be deleted.") + else: + table_group_service.cascade_delete([table_group_name]) + success_message = f"Table Group {table_group_name} has been deleted. " + st.success(success_message) + time.sleep(1) + modal.close() + st.experimental_rerun() def show_add_or_edit_modal(modal, mode, project_code, connection, selected=None): diff --git a/testgen/ui/views/test_suites.py b/testgen/ui/views/test_suites.py index 5e19d88..7ccb39d 100644 --- a/testgen/ui/views/test_suites.py +++ b/testgen/ui/views/test_suites.py @@ -262,10 +262,9 @@ def show_delete_modal(modal, selected=None): selected_test_suite = selected[0] with modal.container(): - test_suite_id = selected_test_suite["id"] test_suite_name = selected_test_suite["test_suite"] - can_be_deleted = test_suite_service.delete([test_suite_id], [test_suite_name], dry_run=True) + can_be_deleted = test_suite_service.cascade_delete([test_suite_name], dry_run=True) fm.render_html_list( selected_test_suite, @@ -278,20 +277,30 @@ def show_delete_modal(modal, selected=None): int_data_width=700, ) + + if not can_be_deleted: + st.markdown( + ":orange[This Test Suite has related data, which includes test definitions and may include test results. If you proceed, all related data will be permanently deleted.
Are you sure you want to proceed?]", + unsafe_allow_html=True, + ) + accept_cascade_delete = st.toggle("I accept deletion of this Test Suite and all related TestGen data.") + with st.form("Delete Test Suite", clear_on_submit=True): - disable_delete_button = authentication_service.current_user_has_read_role() or not can_be_deleted + disable_delete_button = authentication_service.current_user_has_read_role() or ( + not can_be_deleted and not accept_cascade_delete + ) delete = st.form_submit_button("Delete", disabled=disable_delete_button) if delete: - test_suite_service.delete([test_suite_id], [test_suite_name]) - success_message = f"Test Suite {test_suite_name} has been deleted. " - st.success(success_message) - time.sleep(1) - modal.close() - st.experimental_rerun() - - if not can_be_deleted: - st.markdown(":orange[This Test Suite cannot be deleted because it is being used in existing tests.]") + if test_suite_service.are_test_suites_in_use([test_suite_name]): + st.error("This Test Suite is in use by a running process and cannot be deleted.") + else: + test_suite_service.cascade_delete([test_suite_name]) + success_message = f"Test Suite {test_suite_name} has been deleted. " + st.success(success_message) + time.sleep(1) + modal.close() + st.experimental_rerun() def show_add_or_edit_modal(modal, mode, project_code, connection, table_group, selected=None): From e8e7d017f60d6d5ba2335e680910ff51047a6c67 Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Wed, 1 May 2024 18:37:07 -0400 Subject: [PATCH 05/19] fix(redshift): handle all null target column Fixes two test types in redshift flavor that were failing for target columns where all values are null. --- testgen/template/dbsetup/050_populate_new_schema_metadata.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql index ad7c453..81c2f29 100644 --- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql +++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql @@ -227,8 +227,8 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('1016', 'Min_Val', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('1017', 'Missing_Pct', 'redshift', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), ('1018', 'Monthly_Rec_Ct', 'redshift', '(MAX(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE))', '>', '{THRESHOLD_VALUE}'), - ('1019', 'Outlier_Pct_Above', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / COUNT({COLUMN_NAME})::FLOAT', '>', '{THRESHOLD_VALUE}'), - ('1020', 'Outlier_Pct_Below', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / COUNT({COLUMN_NAME})::FLOAT', '>', '{THRESHOLD_VALUE}'), + ('1019', 'Outlier_Pct_Above', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), + ('1020', 'Outlier_Pct_Below', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < 
{BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), ('1021', 'Pattern_Match', 'redshift', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM((NULLIF({COLUMN_NAME}, '''') SIMILAR TO ''{BASELINE_VALUE}'')::BIGINT)', '>', '{THRESHOLD_VALUE}'), ('1022', 'Recency', 'redshift', 'DATEDIFF(''D'', MAX({COLUMN_NAME}), ''{RUN_DATE}''::DATE)', '>', '{THRESHOLD_VALUE}'), ('1023', 'Required', 'redshift', 'COUNT(*) - COUNT( {COLUMN_NAME} )', '>', '{THRESHOLD_VALUE}'), From 7cf3e5961b98cb7039786b0431ad65d6729618c2 Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Wed, 1 May 2024 18:42:13 -0400 Subject: [PATCH 06/19] fix(synapse): add autocommit to connection params --- testgen/common/database/flavor/mssql_flavor_service.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/testgen/common/database/flavor/mssql_flavor_service.py b/testgen/common/database/flavor/mssql_flavor_service.py index b45855a..f43860d 100644 --- a/testgen/common/database/flavor/mssql_flavor_service.py +++ b/testgen/common/database/flavor/mssql_flavor_service.py @@ -24,6 +24,9 @@ def get_connection_string_from_fields(self, dctCredentials, strPW): "&autocommit=True" ) + if "synapse" in hostname: + strConnect += "&autocommit=True" + return strConnect def get_pre_connection_queries(self, dctCredentials): # noqa ARG002 @@ -34,7 +37,3 @@ def get_pre_connection_queries(self, dctCredentials): # noqa ARG002 def get_concat_operator(self): return "+" - - def get_connect_args(self): - return {} - # return {"pool_pre_ping": "True"} From 847834f0946fbafa0d5ac10c12e50df97b097aba Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Wed, 1 May 2024 19:01:17 -0400 Subject: [PATCH 07/19] refactor(table groups): preview dialog visuals --- testgen/ui/views/table_groups.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups.py index 1d7459f..efd2119 100644 --- a/testgen/ui/views/table_groups.py +++ b/testgen/ui/views/table_groups.py @@ -269,7 +269,7 @@ def show_delete_modal(modal, selected=None): def show_add_or_edit_modal(modal, mode, project_code, connection, selected=None): connection_id = connection["connection_id"] with modal.container(): - table_groups_settings_tab, table_groups_preview_tab = st.tabs(["Table Group Settings", "Preview"]) + table_groups_settings_tab, table_groups_preview_tab = st.tabs(["Table Group Settings", "Test"]) with table_groups_settings_tab: selected_table_group = selected[0] if mode == "edit" else None @@ -400,25 +400,18 @@ def show_add_or_edit_modal(modal, mode, project_code, connection, selected=None) modal.close() st.experimental_rerun() - if mode == "edit": - bottom_left_column, bottom_right_column = st.columns([0.5, 0.5]) - test = bottom_left_column.button("Test") - status_form = bottom_right_column.empty() - if test: - table_group_preview(entity, connection_id, project_code, status_form, show_results=False) - with table_groups_preview_tab: if mode == "edit": preview_left_column, preview_right_column = st.columns([0.5, 0.5]) status_preview = preview_right_column.empty() - preview = preview_left_column.button("Preview Table Group") + preview = preview_left_column.button("Test Table Group") if preview: - table_group_preview(entity, connection_id, project_code, status_preview, show_results=True) + table_group_preview(entity, connection_id, project_code, status_preview) else: st.write("No preview available while adding a Table 
Group. Save the configuration first.") -def table_group_preview(entity, connection_id, project_code, status, show_results=False): +def table_group_preview(entity, connection_id, project_code, status): status.empty() status.info("Connecting to the Table Group ...") try: @@ -432,8 +425,7 @@ def table_group_preview(entity, connection_id, project_code, status, show_result tables.add(result["table_name"]) columns.append(result["column_name"]) - if show_results: - show_test_results(schemas, tables, columns, qc_results) + show_test_results(schemas, tables, columns, qc_results) status.empty() status.success("Operation has finished successfully.") @@ -445,14 +437,12 @@ def table_group_preview(entity, connection_id, project_code, status, show_result error_message = "Result is empty." if not all(qc_results): error_message = f"Error testing the connection to the Table Group. Details: {qc_results}" - if show_results: - st.text_area("Table Group Error Details", value=error_message) + st.text_area("Table Group Error Details", value=error_message) except Exception as e: status.empty() - status.error("Error previewing the Table Group.") + status.error("Error testing the Table Group.") error_message = e.args[0] - if show_results: - st.text_area("Table Group Error Details", value=error_message) + st.text_area("Table Group Error Details", value=error_message) def show_test_results(schemas, tables, columns, qc_results): From 848c79c64919278a0b6cf440bdf10c603c35e44e Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Wed, 1 May 2024 19:07:56 -0400 Subject: [PATCH 08/19] feat(ui): add some refresh icons Add a refresh icon in the toolbars for Data Profile and Data Quality Testing pages. --- testgen/ui/views/profiling_summary.py | 14 +++++++++++--- testgen/ui/views/test_runs.py | 12 ++++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/testgen/ui/views/profiling_summary.py b/testgen/ui/views/profiling_summary.py index f48b759..f10a06f 100644 --- a/testgen/ui/views/profiling_summary.py +++ b/testgen/ui/views/profiling_summary.py @@ -1,3 +1,4 @@ +from time import sleep import typing import streamlit as st @@ -37,7 +38,7 @@ def render(self) -> None: str_project = st.session_state["project"] # Setup Toolbar - tool_bar = tb.ToolBar(3, 2, 0, None) + tool_bar = tb.ToolBar(2, 3, 0, None) with tool_bar.long_slots[0]: # Table Groups selection -- optional criterion @@ -46,6 +47,13 @@ def render(self) -> None: "Table Group", df_tg, "table_groups_name", "id", boo_required=False, str_default=None ) + with tool_bar.short_slots[0]: + if st.button("⟳", help="Refresh the grid", key="refresh-button-profiling"): + st.cache_data.clear() + st.toast("Page Refreshed!") + sleep(1) + st.experimental_rerun() + df, show_columns = get_db_profiling_runs(str_project, str_table_groups_id) time_columns = ["start_time"] @@ -109,7 +117,7 @@ def open_drill_downs(dct_selected_rows, button_slots): if dct_selected_rows: dct_selected_row = dct_selected_rows[0] - if button_slots[0].button( + if button_slots[1].button( "Profiling Results →", help="Review profiling characteristics for each data column", use_container_width=True, @@ -120,7 +128,7 @@ def open_drill_downs(dct_selected_rows, button_slots): session.current_page_args = {} st.experimental_rerun() - if button_slots[1].button( + if button_slots[2].button( "Anomalies →", help="Review potential data problems identified in profiling", use_container_width=True, diff --git a/testgen/ui/views/test_runs.py b/testgen/ui/views/test_runs.py index 252fa9e..b624004 100644 --- 
a/testgen/ui/views/test_runs.py +++ b/testgen/ui/views/test_runs.py @@ -1,3 +1,4 @@ +from time import sleep import typing import streamlit as st @@ -36,7 +37,7 @@ def render(self) -> None: str_project = st.session_state["project"] # Setup Toolbar - tool_bar = tb.ToolBar(4, 1, 0, None) + tool_bar = tb.ToolBar(4, 2, 0, None) with tool_bar.long_slots[0]: # Table Groups selection -- optional criterion @@ -45,6 +46,13 @@ def render(self) -> None: "Table Group", df_tg, "table_groups_name", "id", boo_required=False, str_default=None ) + with tool_bar.short_slots[0]: + if st.button("⟳", help="Refresh the grid", key="refresh-button-test-runs"): + st.cache_data.clear() + st.toast("Page Refreshed!") + sleep(1) + st.experimental_rerun() + with tool_bar.long_slots[1]: # Table Groups selection -- optional criterion df_ts = get_db_test_suite_choices(str_project, str_table_groups_id) @@ -60,7 +68,7 @@ def render(self) -> None: dct_selected_rows = fm.render_grid_select(df, show_columns) dct_selected_row = dct_selected_rows[0] if dct_selected_rows else None - if tool_bar.short_slots[0].button( + if tool_bar.short_slots[1].button( "Test Results →", help="Review test results for the selected run", use_container_width=True, From fb35177a98adbc6e42b809e418a479d10b58446f Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Thu, 2 May 2024 09:03:20 -0400 Subject: [PATCH 09/19] feat(profiling): add new profiling details view --- testgen/__main__.py | 11 +- .../queries/execute_cat_tests_query.py | 2 +- testgen/commands/run_execute_cat_tests.py | 13 +- testgen/commands/run_execute_tests.py | 22 +- .../030_initialize_new_schema_structure.sql | 1 + .../050_populate_new_schema_metadata.sql | 196 ++++--- .../project_profiling_query_mssql.yaml | 4 +- .../project_profiling_query_postgresql.yaml | 4 +- .../create_functions_postgresql.sql | 4 +- .../project_profiling_query_redshift.yaml | 4 +- .../create_functions_redshift.sql | 4 +- .../project_profiling_query_snowflake.yaml | 4 +- .../create_functions_snowflake.sql | 4 +- .../project_profiling_query_trino.yaml | 4 +- .../profiling/functional_datatype.sql | 66 ++- testgen/ui/queries/profiling_queries.py | 147 +++++ testgen/ui/queries/test_suite_queries.py | 32 ++ testgen/ui/services/connection_service.py | 4 +- testgen/ui/services/form_service.py | 47 +- testgen/ui/services/table_group_service.py | 2 +- testgen/ui/services/test_suite_service.py | 32 ++ testgen/ui/views/profiling_anomalies.py | 54 +- testgen/ui/views/profiling_details.py | 335 +++++++++++ testgen/ui/views/profiling_results.py | 542 +++--------------- testgen/ui/views/profiling_summary.py | 2 +- testgen/ui/views/test_definitions.py | 15 +- testgen/ui/views/test_results.py | 58 +- testgen/ui/views/test_runs.py | 2 +- testgen/ui/views/test_suites.py | 116 ++-- 29 files changed, 1047 insertions(+), 684 deletions(-) create mode 100644 testgen/ui/queries/profiling_queries.py create mode 100644 testgen/ui/views/profiling_details.py diff --git a/testgen/__main__.py b/testgen/__main__.py index b9f4ad1..6d389d2 100644 --- a/testgen/__main__.py +++ b/testgen/__main__.py @@ -137,10 +137,17 @@ def run_profile(configuration: Configuration, table_group_id: str): required=False, default=settings.DEFAULT_TEST_SUITE_KEY, ) +@click.option( + "-gs", + "--generation-set", + help="A defined subset of tests to generate for your purpose. 
Use a generation_set defined for your project.", + required=False, + default=None, +) @pass_configuration -def run_test_generation(configuration: Configuration, table_group_id: str, test_suite_key: str): +def run_test_generation(configuration: Configuration, table_group_id: str, test_suite_key: str, generation_set: str): LOG.info("CurrentStep: Generate Tests - Main Procedure") - message = run_test_gen_queries(table_group_id, test_suite_key) + message = run_test_gen_queries(table_group_id, test_suite_key, generation_set) LOG.info("Current Step: Generate Tests - Main Procedure Complete") display_service.echo("\n" + message) diff --git a/testgen/commands/queries/execute_cat_tests_query.py b/testgen/commands/queries/execute_cat_tests_query.py index 552b32b..a6f3d01 100644 --- a/testgen/commands/queries/execute_cat_tests_query.py +++ b/testgen/commands/queries/execute_cat_tests_query.py @@ -51,7 +51,7 @@ def _ReplaceParms(self, strInputString): strInputString = strInputString.replace( "{NOW}", date_service.get_now_as_string_with_offset(self.minutes_offset) ) - strInputString = strInputString.replace("{EXCEPTION_MESSAGE}", self.exception_message) + strInputString = strInputString.replace("{EXCEPTION_MESSAGE}", self.exception_message.strip()) for parm, value in self.dctTestParms.items(): strInputString = strInputString.replace("{" + parm.upper() + "}", str(value)) diff --git a/testgen/commands/run_execute_cat_tests.py b/testgen/commands/run_execute_cat_tests.py index 166a89a..453747c 100644 --- a/testgen/commands/run_execute_cat_tests.py +++ b/testgen/commands/run_execute_cat_tests.py @@ -65,7 +65,9 @@ def FinalizeTestRun(clsCATExecute): RunActionQueryList(("DKTG"), lstQueries) -def run_cat_test_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite, minutes_offset=0, spinner=None): +def run_cat_test_queries( + strTestRunID, strTestTime, strProjectCode, strTestSuite, error_msg, minutes_offset=0, spinner=None +): # PARAMETERS AND SET-UP booErrors = False LOG.info("CurrentStep: Retrieving Parameters") @@ -78,6 +80,7 @@ def run_cat_test_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite ) clsCATExecute.test_run_id = strTestRunID clsCATExecute.run_date = strTestTime + clsCATExecute.exception_message += error_msg # Set Project Connection Params in common.db_bridgers from retrieved params LOG.info("CurrentStep: Assigning Connection Parms") @@ -135,9 +138,9 @@ def run_cat_test_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite LOG.info("Test results successfully parsed.") if intErrors > 0: booErrors = True - LOG.warning( - f"Errors were encountered executing aggregate tests. ({intErrors} errors occurred.) Please check log." - ) + cat_error_msg = f"Errors were encountered executing aggregate tests. ({intErrors} errors occurred.) Please check log." 
+ LOG.warning(cat_error_msg) + clsCATExecute.exception_message += cat_error_msg else: LOG.info("No valid tests were available to perform") @@ -145,7 +148,7 @@ def run_cat_test_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite booErrors = True sqlsplit = e.args[0].split("[SQL", 1) errorline = sqlsplit[0].replace("'", "''") if len(sqlsplit) > 0 else "unknown error" - clsCATExecute.exception_message = f"{type(e).__name__}: {errorline}" + clsCATExecute.exception_message += f"{type(e).__name__}: {errorline}" raise else: diff --git a/testgen/commands/run_execute_tests.py b/testgen/commands/run_execute_tests.py index 6da45d0..17e054e 100644 --- a/testgen/commands/run_execute_tests.py +++ b/testgen/commands/run_execute_tests.py @@ -22,6 +22,7 @@ def run_test_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite, minutes_offset=0, spinner=None): booErrors = False + error_msg = "" LOG.info("CurrentStep: Retrieving TestExec Parameters") dctParms = RetrieveTestExecParms(strProjectCode, strTestSuite) @@ -76,7 +77,7 @@ def run_test_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite, mi spinner.next() # Execute list, returning test results - LOG.info("CurrentStep: Executing Non-CAT Queries") + LOG.info("CurrentStep: Executing Non-CAT Test Queries") lstTestResults, colResultNames, intErrors = RunThreadedRetrievalQueryList( "PROJECT", lstTestQueries, dctParms["max_threads"], spinner ) @@ -87,9 +88,11 @@ def run_test_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite, mi WriteListToDB("DKTG", lstTestResults, colResultNames, "test_results") if intErrors > 0: booErrors = True - LOG.warning( - f"Errors were encountered executing query tests. ({intErrors} errors occurred.) Please check log." + error_msg = ( + f"Errors were encountered executing aggregate tests. ({intErrors} errors occurred.) " + "Please check log. 
" ) + LOG.warning(error_msg) else: LOG.info("No tests found") @@ -103,7 +106,7 @@ def run_test_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite, mi raise else: - return booErrors + return booErrors, error_msg def run_execution_steps_in_background(strProjectCode, strTestSuite, minutes_offset=0): @@ -123,6 +126,8 @@ def run_execution_steps_in_background(strProjectCode, strTestSuite, minutes_offs def run_execution_steps(strProjectCode, strTestSuite, minutes_offset=0, spinner=None): # Initialize required parms for all three steps booErrors = False + error_msg = "" + strTestRunID = str(uuid.uuid4()) strTestTime = date_service.get_now_as_string_with_offset(minutes_offset) @@ -133,11 +138,14 @@ def run_execution_steps(strProjectCode, strTestSuite, minutes_offset=0, spinner= run_parameter_validation_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite, True) LOG.info("CurrentStep: Execute Step - Test Execution") - if run_test_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite, minutes_offset, spinner): - booErrors = True + booErrors, error_msg = run_test_queries( + strTestRunID, strTestTime, strProjectCode, strTestSuite, minutes_offset, spinner + ) LOG.info("CurrentStep: Execute Step - CAT Test Execution") - if run_cat_test_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite, minutes_offset, spinner): + if run_cat_test_queries( + strTestRunID, strTestTime, strProjectCode, strTestSuite, error_msg, minutes_offset, spinner + ): booErrors = True if booErrors: diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index 1ec18b7..fbd5c86 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -452,6 +452,7 @@ CREATE TABLE test_types ( run_type VARCHAR(10), test_scope VARCHAR, dq_dimension VARCHAR(50), + health_dimension VARCHAR(50), threshold_description VARCHAR(200), usage_notes VARCHAR, active VARCHAR diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql index 81c2f29..acc5de1 100644 --- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql +++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql @@ -16,11 +16,8 @@ INSERT INTO profile_anomaly_types (id, anomaly_type, data_object, anomaly_name, VALUES ('1001', 'Suggested_Type', 'Column', 'Suggested Data Type', 'Data stored as text all meets criteria for a more suitable type. ', '(functional_data_type NOT IN (''Boolean'', ''Flag'') ) AND (column_type ILIKE ''%ch ar%'' OR column_type ILIKE ''text'') AND NOT (datatype_suggestion ILIKE ''%char%'' OR datatype_suggestion ILIKE ''text'')', 'p.datatype_suggestion::VARCHAR(200)', 'Likely', 'Consider changing the column data type to tighte n controls over data ingested and to make values more efficient, consistent and suitable for downstream analysis.'), - ('1002', 'Non_Standard_Blanks', 'Column', 'Non-Standard Blank Values', 'Values representing missing data may be unexpected or inconsistent. 
Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA", "ERROR".', 'p.filled_value_ct > 0 OR p.zero_length_ct > 0', '((((''Filled Values: '' || p.filled_value_ct::VARCHAR(10)) || '', Null: '') || - p.null_value_ct::VARCHAR(10)) || '', Empty String: '') || - p.zero_length_ct::VARCHAR(10)', 'Definite', 'Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null.'), - ('1003', 'Invalid_Zip_USA', 'Column', 'Invalid USA Zip Code Format', 'Some values present do not conform with the expected format of USA Zip Codes.', 'p.std_pattern_match = ''ZIP_USA'' AND (p.general_type <> ''A'' OR p.filled_value_ct > 0 OR p.min_length >= 1 AND p.min_length <= 4 OR p.max_length > 10)', 'CASE WHEN p.general_type = ''N'' THEN ''Type: '' || p.column_type || '', '' ELSE '''' END || ''Min Length: '' || p.min_length::VARCHAR || '', Max Length: '' || p.max_length::VARCHAR - || '', Filled Values: '' || p.filled_value_ct::VARCHAR', 'Definite', 'Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made.'), + ('1002', 'Non_Standard_Blanks', 'Column', 'Non-Standard Blank Values', 'Values representing missing data may be unexpected or inconsistent. Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA", "ERROR".', 'p.filled_value_ct > 0 OR p.zero_length_ct > 0', '''Filled Values: '' || p.filled_value_ct::VARCHAR || '', Empty String: '' || p.zero_length_ct::VARCHAR || '', Null: '' || p.null_value_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null.'), + ('1003', 'Invalid_Zip_USA', 'Column', 'Invalid USA Zip Code Format', 'Some values present do not conform with the expected format of USA Zip Codes.', 'p.std_pattern_match = ''ZIP_USA'' AND (p.general_type <> ''A'' OR p.filled_value_ct > 0 OR p.min_length >= 1 AND p.min_length <= 4 OR p.max_length > 10)', 'CASE WHEN p.general_type = ''N'' THEN ''Type: '' || p.column_type || '', '' ELSE '''' END || ''Min Length: '' || p.min_length::VARCHAR || '', Max Length: '' || p.max_length::VARCHAR || '', Filled Values: '' || p.filled_value_ct::VARCHAR', 'Definite', 'Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made.'), ('1004', 'Multiple_Types_Minor', 'Multi-Col', 'Multiple Data Types per Column Name - Minor', 'Columns with the same name have the same general type across tables, but the types do not exactly match. Truncation issues may result if columns are commingled and assumed to be the same format.', 'm.general_type_ct = 1 AND m.type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Possible', 'Consider changing the column data types to be fully consistent. 
This will tighten your standards at ingestion and assure that data is consistent between tables.'), ('1005', 'Multiple_Types_Major', 'Multi-Col', 'Multiple Data Types per Column Name - Major', 'Columns with the same name have broadly different types across tables. Differences could be significant enough to cause errors in downstream analysis, extra steps resulting in divergent business logic and inconsistencies in results.', 'm.general_type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Likely', 'Ideally, you should change the column data types to be fully consistent. If the data is meant to be different, you should change column names so downstream users aren''t led astray.'), ('1006', 'No_Values', 'Column', 'No Column Values Present', 'This column is present in the table, but no values have been ingested or assigned in any records. This could indicate missing data or a processing error. Note that this considers dummy values and zero-length values as missing data. ', '(p.null_value_ct + p.filled_value_ct + p.zero_length_ct) = p.record_ct', '''Null: '' || p.null_value_ct::VARCHAR(10) || '', Filled: '' || p.filled_value_ct::VARCHAR(10) || '', Zero Len: '' || p.zero_length_ct::VARCHAR(10)', 'Possible', 'Review your source data, ingestion process, and any processing steps that update this column.'), @@ -56,12 +53,12 @@ n controls over data ingested and to make values more efficient, consistent and AND (p.value_ct - p.zero_length_ct - p.filled_value_ct) < p.record_ct', '(p.record_ct - (p.value_ct - p.zero_length_ct - p.filled_value_ct))::VARCHAR(20) || '' of '' || p.record_ct::VARCHAR(20) || '' blank values: '' || ROUND(100.0 * (p.record_ct - (p.value_ct - p.zero_length_ct - p.filled_value_ct))::NUMERIC(18, 5) - / p.value_ct::NUMERIC(18, 5), 2)::VARCHAR(40) || ''%''', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected, supplemented or excluded.'), + / NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2)::VARCHAR(40) || ''%''', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected, supplemented or excluded.'), ('1014', 'Small Divergent Value Ct', 'Column', 'Small Percentage of Divergent Values Found', 'Under 3% of values in this column were found to be different from the most common value. This could indicate a data error.', '(100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT / p.value_ct::FLOAT) > 97::FLOAT AND (100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT / - p.value_ct::FLOAT) < 100::FLOAT', '''Single Value Pct: '' || ROUND(100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT - / p.value_ct::FLOAT)::VARCHAR(40) + NULLIF(p.value_ct, 0)::FLOAT) < 100::FLOAT', '''Single Value Pct: '' || ROUND(100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT + / NULLIF(p.value_ct, 0)::FLOAT)::VARCHAR(40) || '', Value | Freq: '' || top_freq_values', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected.'), ('1015', 'Boolean_Value_Mismatch', 'Column', 'Unexpected Boolean Values Found', 'This column appears to contain boolean (True/False) data, but unexpected values were found. This could indicate inconsistent coding for the same intended values, potentially leading to downstream errors or inconsistent business logic. 
', '(distinct_value_ct > 1 AND ((lower(top_freq_values) ILIKE ''| true |%'' OR lower(top_freq_values) ILIKE ''| false |%'') AND NOT (lower(top_freq_values) ILIKE ''%| true |%'' AND lower(top_freq_values) ILIKE ''%| false |%'')) @@ -83,18 +80,18 @@ n controls over data ingested and to make values more efficient, consistent and AND NOT (p.column_name ILIKE ''%email%'' OR p.column_name ILIKE ''%addr%'')', '''Value Range: '' || p.min_text || '' thru '' || max_text', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with email addresses.'), ('1023', 'Small_Numeric_Value_Ct', 'Column', 'Unexpected Numeric Values Found', 'Under 3% of values in this column were found to be numeric. This could indicate a data error.', e'p.general_type = \'A\' - AND p.numeric_ct::FLOAT / p.record_ct::FLOAT < 0.03 + AND p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT < 0.03 AND p.numeric_ct > 0', - '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', + '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5)/NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and follow-up with data owners to determine whether numeric values are invalid entries here.'), ('1024', 'Invalid_Zip3_USA', 'Column', 'Invalid USA ZIP-3 Format', 'The majority of values in this column are 3-digit zips, but divergent patterns were found. This could indicate an incorrect roll-up category or a PII concern.', 'p.distinct_pattern_ct > 1 AND (p.column_name ilike ''%zip%'' OR p.column_name ILIKE ''%postal%'') AND SPLIT_PART(p.top_patterns, '' | '', 2) = ''NNN'' - AND SPLIT_PART(p.top_patterns, '' | '', 1)::FLOAT/value_ct::FLOAT > 0.50', '''Pattern: '' || p.top_patterns', 'Definite', 'Review your source data, ingestion process, and any processing steps that update this column.'), + AND SPLIT_PART(p.top_patterns, '' | '', 1)::FLOAT/NULLIF(value_ct, 0)::FLOAT > 0.50', '''Pattern: '' || p.top_patterns', 'Definite', 'Review your source data, ingestion process, and any processing steps that update this column.'), ('1025', 'Delimited_Data_Embedded', 'Column', 'Delimited Data Embedded in Column', 'Delimited data, separated by a common delimiter (comma, tab, pipe or caret) is present in over 80% of column values. This could indicate data that was incorrectly ingested, or data that would be better represented in parsed form.', 'p.std_pattern_match = ''DELIMITED_DATA''', 'CASE WHEN p.top_freq_values IS NULL THEN ''Min: '' || p.min_text || '', Max: '' || p.max_text ELSE ''Top Freq: '' || p.top_freq_values END', 'Likely', 'Review your source data and follow-up with data consumers to determine the most useful representation of this data.'), ('1026', 'Char_Column_Number_Units', 'Column', 'Character Column with Numbers and Units', 'This column is defined as alpha, but values include numbers with percents or common units. Embedded measures in alpha columns are harder to access, won''t sort correctly, and might contradict user expectations downstream. Consider parsing into numeric and UOM columns to improve usability.', - 'p.includes_digit_ct::FLOAT/p.value_ct::FLOAT > 0.5 AND TRIM(fn_parsefreq(p.top_freq_values, 1, 1)) ~ ''(?i)^[0-9]+(\.[0-9]+)? 
?(%|lb|oz|kg|g|mg|km|m|cm|mm|mi|ft|in)$''', + 'p.includes_digit_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT > 0.5 AND TRIM(fn_parsefreq(p.top_freq_values, 1, 1)) ~ ''(?i)^[0-9]+(\.[0-9]+)? ?(%|lb|oz|kg|g|mg|km|m|cm|mm|mi|ft|in)$''', '''Top Freq: '' || p.top_freq_values', 'Possible', 'Review your source data and ingestion process. Consider whether it might be better to parse the numeric and unit data and store in separate columns.'), ('1027', 'Variant_Coded_Values', 'Variant', 'Variant Codings for Same Values', 'This column contains more than one common variants that represent a single value or state. This can occur when data is integrated from multiple sources with different standards, or when free entry is permitted without validation. The variations can cause confusion and error for downstream data users and multiple versions of the truth. ', 'p.distinct_value_ct <= 20', '''Variants Found: '' || intersect_list', 'Definite', 'Review your source data and ingestion process. Consider cleansing this data to standardize on a single set of definitive codes.'); @@ -105,52 +102,55 @@ TRUNCATE TABLE test_types; INSERT INTO test_types (id, test_type, test_name_short, test_name_long, test_description, except_message, measure_uom, measure_uom_description, selection_criteria, default_parm_columns, default_parm_values, default_parm_prompts, - default_parm_help, default_severity, run_type, test_scope, dq_dimension, threshold_description, usage_notes, active) -VALUES ('1001', 'Aggregate_No_Drops', 'Aggregate Minimum', 'Aggregate values per group are at or above reference', 'Tests that aggregate values for each set of column values are at least the same as reference dataset', 'Aggregate measure per set of column values is not at least the same as reference dataset.', 'Mismatched measures', NULL, NULL, 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts', NULL, 'Fail', 'QUERY', 'multi-column', 'Accuracy', 'Expected count of value combinations with lower or missing aggregate measure', NULL, 'N'), - ('1002', 'Aggregate_Increase', 'Aggregate Increase', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'multi-column', 'Accuracy', 'Expected count of value combinations with not exceeding aggregate measure', NULL, 'N'), - ('1003', 'Aggregate_Match', 'Aggregate Match', 'Aggregate values per group match reference', 'Tests for exact match in aggregate values for each set of column values vs. 
reference dataset', 'Aggregate measure per set of column values does not exactly match reference dataset.', 'Mismatched measures', NULL, NULL, 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts', NULL, 'Fail', 'QUERY', 'multi-column', 'Consistency', 'Expected count of value combinations with non-matching aggregate measure', NULL, 'N'), - ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count consistent', 'Tests that the maximum count of characters in a column value has not dropped vs. baseline data', 'Maximum length of values has dropped from prior expected length.', 'Values over max', NULL, 'general_type =''A'' AND max_length > 0 AND ( (min_length = avg_length AND max_length = avg_length) OR (numeric_ct <> value_ct ) ) AND functional_table_type NOT LIKE ''%window%'' /* The conditions below are to eliminate overlap with : LOV_Match (excluded selection criteria for this test_type), Pattern_Match (excluded selection criteria for this test_type), Constant (excluded functional_data_type Constant and Boolean) */ AND ( (distinct_value_ct NOT BETWEEN 2 AND 10 AND functional_data_type NOT IN ( ''Constant'', ''Boolean'') ) AND NOT ( fn_charcount(top_patterns, E'' \| '' ) = 1 AND fn_charcount(top_patterns, E'' \| '' ) IS NOT NULL AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > ''''))', 'threshold_value', 'max_length', 'Maximum String Length at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Maximum length expected', 'Alpha Truncation tests that the longest text value in a column hasn''t become shorter than the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'), - ('1005', 'Avg_Shift', 'Average Shift', 'Column mean is consistent with reference', 'Tests for statistically-significant shift in mean value for column from average calculated at baseline.', 'Standardized difference between averages is over the selected threshold level.', 'Difference Measure', 'Cohen''s D Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', 'baseline_value_ct,baseline_avg,baseline_sd,threshold_value', 'value_ct,avg_value,stdev_value,0.5::VARCHAR', 'Value Ct at Baseline,Mean at Baseline,Std Deviation at Baseline,Threshold Difference Measure ', NULL, 'Warning', 'CAT', 'column', 'Consistency', 'Standardized Difference Measure', 'Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen''s D, a statistical technique to identify significant shifts in a value. Cohen''s D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. 
Depending on your data, some difference may be expected, so it''s reasonable to adjust the threshold value that triggers test failure. This test works well for measures, or even for identifiers if you expect them to increment consistently. You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. Consider this test along with Variability Increase. If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself. ', 'Y'), - ('1006', 'Condition_Flag', 'Custom Condition', 'Column values match pre-defined condition', 'Tests that each record in the table matches a pre-defined, custom condition', 'Value(s) found not matching defined condition.', 'Values Failing', NULL, NULL, 'threshold_value,custom_query', NULL, 'Threshold Error Count,Custom SQL Expression', 'The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. An expression can reference only columns in the selected table.', 'Fail', 'CAT', 'custom', 'Validity', 'Count of records that don''t meet test condition', 'Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. Interpretation is based on the user-defined meaning of the test.', 'Y'), - ('1007', 'Constant', 'Constant Match', 'All column values match constant value', 'Tests that all values in the column match the constant value identified in baseline data', 'A constant value is expected for this column.', 'Mismatched values', NULL, NULL, 'baseline_value,threshold_value', NULL, 'Constant Value at Baseline,Threshold Error Count', 'The single, unchanging value of the column, per baseline|The number of errors that are acceptable before test fails.', 'Fail', 'CAT', 'column', 'Validity', 'Count of records with unexpected values', 'Constant Match tests that a single value determined to be a constant in baseline profiling is still the only value for the column that appears in subsequent versions of the dataset. Sometimes new data or business knowledge may reveal that the value is not a constant at all, even though only one value was present at profiling. In this case, you will want to disable this test. 
Alternatively, you can use the Value Match test to provide a limited number of valid values for the column.', 'Y'), + default_parm_help, default_severity, run_type, test_scope, dq_dimension, health_dimension, threshold_description, + usage_notes, active) +VALUES ('1001', 'Aggregate_No_Drops', 'Aggregate Minimum', 'Aggregate values per group are at or above reference', 'Tests that aggregate values for each set of column values are at least the same as reference dataset', 'Aggregate measure per set of column values is not at least the same as reference dataset.', 'Mismatched measures', NULL, NULL, 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts', NULL, 'Fail', 'QUERY', 'multi-column', 'Accuracy', 'Data Drift', 'Expected count of value combinations with lower or missing aggregate measure', NULL, 'N'), + ('1002', 'Aggregate_Increase', 'Aggregate Increase', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'multi-column', 'Accuracy', 'Data Drift', 'Expected count of value combinations with not exceeding aggregate measure', NULL, 'N'), + ('1003', 'Aggregate_Match', 'Aggregate Match', 'Aggregate values per group match reference', 'Tests for exact match in aggregate values for each set of column values vs. reference dataset', 'Aggregate measure per set of column values does not exactly match reference dataset.', 'Mismatched measures', NULL, NULL, 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts', NULL, 'Fail', 'QUERY', 'multi-column', 'Consistency', 'Data Drift', 'Expected count of value combinations with non-matching aggregate measure', NULL, 'N'), + ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count consistent', 'Tests that the maximum count of characters in a column value has not dropped vs. 
baseline data', 'Maximum length of values has dropped from prior expected length.', 'Values over max', NULL, 'general_type =''A'' AND max_length > 0 AND ( (min_length = avg_length AND max_length = avg_length) OR (numeric_ct <> value_ct ) ) AND functional_table_type NOT LIKE ''%window%'' /* The conditions below are to eliminate overlap with : LOV_Match (excluded selection criteria for this test_type), Pattern_Match (excluded selection criteria for this test_type), Constant (excluded functional_data_type Constant and Boolean) */ AND ( (distinct_value_ct NOT BETWEEN 2 AND 10 AND functional_data_type NOT IN ( ''Constant'', ''Boolean'') ) AND NOT ( fn_charcount(top_patterns, E'' \| '' ) = 1 AND fn_charcount(top_patterns, E'' \| '' ) IS NOT NULL AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > ''''))', 'threshold_value', 'max_length', 'Maximum String Length at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Maximum length expected', 'Alpha Truncation tests that the longest text value in a column hasn''t become shorter than the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'), + ('1005', 'Avg_Shift', 'Average Shift', 'Column mean is consistent with reference', 'Tests for statistically-significant shift in mean value for column from average calculated at baseline.', 'Standardized difference between averages is over the selected threshold level.', 'Difference Measure', 'Cohen''s D Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', 'baseline_value_ct,baseline_avg,baseline_sd,threshold_value', 'value_ct,avg_value,stdev_value,0.5::VARCHAR', 'Value Ct at Baseline,Mean at Baseline,Std Deviation at Baseline,Threshold Difference Measure ', NULL, 'Warning', 'CAT', 'column', 'Consistency', 'Data Drift', 'Standardized Difference Measure', 'Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen''s D, a statistical technique to identify significant shifts in a value. Cohen''s D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. Depending on your data, some difference may be expected, so it''s reasonable to adjust the threshold value that triggers test failure. This test works well for measures, or even for identifiers if you expect them to increment consistently. You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. Consider this test along with Variability Increase. If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself. 
', 'Y'), + ('1006', 'Condition_Flag', 'Custom Condition', 'Column values match pre-defined condition', 'Tests that each record in the table matches a pre-defined, custom condition', 'Value(s) found not matching defined condition.', 'Values Failing', NULL, NULL, 'threshold_value,custom_query', NULL, 'Threshold Error Count,Custom SQL Expression', 'The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. An expression can reference only columns in the selected table.', 'Fail', 'CAT', 'custom', 'Validity', 'Schema Drift', 'Count of records that don''t meet test condition', 'Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. Interpretation is based on the user-defined meaning of the test.', 'Y'), + ('1007', 'Constant', 'Constant Match', 'All column values match constant value', 'Tests that all values in the column match the constant value identified in baseline data', 'A constant value is expected for this column.', 'Mismatched values', NULL, NULL, 'baseline_value,threshold_value', NULL, 'Constant Value at Baseline,Threshold Error Count', 'The single, unchanging value of the column, per baseline|The number of errors that are acceptable before test fails.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Count of records with unexpected values', 'Constant Match tests that a single value determined to be a constant in baseline profiling is still the only value for the column that appears in subsequent versions of the dataset. Sometimes new data or business knowledge may reveal that the value is not a constant at all, even though only one value was present at profiling. In this case, you will want to disable this test. Alternatively, you can use the Value Match test to provide a limited number of valid values for the column.', 'Y'), ('1008', 'CUSTOM', 'Custom Test', 'Custom-defined business rule', 'Custom SQL Test', 'Errors were detected according to test definition.', 'Errors found', 'Count of errors identified by query', NULL, 'custom_query', NULL, 'Custom SQL Query Returning Error Records', 'Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow-up. -A query can refer to any tables in the database. You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.', 'Fail', 'QUERY', 'custom', 'Accuracy', 'Expected count of errors found by custom query', 'This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. 
Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow-up. Interpretation is based on the user-defined meaning of the test.', 'Y'), - ('1009', 'Daily_Record_Ct', 'Daily Records', 'All dates present within date range', 'Tests for presence of every calendar date within min/max date range, per baseline data', 'Not every date value between min and max dates is present, unlike at baseline.', 'Missing dates', NULL, 'general_type= ''D'' AND date_days_present > 21 AND date_days_present - (DATEDIFF(''day'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''day'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', 'threshold_value', '0', 'Threshold Missing Calendar Days', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Missing calendar days within min/max range', 'Daily Records tests that at least one record is present for every day within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each day. A failure here would suggest missing records for the number of days identified without data. You can adjust the threshold to accept a number of days that you know legitimately have no records. ', 'Y'), - ('1010', 'DATA MATCH', 'Combo Match', 'Column value combinations match reference', 'Tests for the presence of the same set of column values in a reference table', 'Column values don''t match reference table values.', 'Mismatched values', NULL, NULL, 'subset_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition', NULL, 'Record Subset Condition,Column Names to Match,Schema Name,Table Name,Match Schema Name,Match Table Name,Match Table Record Subset Condition', NULL, 'Fail', 'QUERY', 'multi-column', 'Validity', 'Expected count of non-matching value combinations', NULL, 'N'), - ('1011', 'Dec_Trunc', 'Decimal Truncation', 'Sum of fractional values at or above reference', 'Tests for decimal truncation by confirming that the sum of fractional values in data is no less than the sum at baseline', 'The sum of fractional values is under baseline, which may indicate decimal truncation', 'Fractional sum', 'The sum of all decimal values from all data for this column', 'fractional_sum IS NOT NULL AND functional_table_type LIKE''%cumulative%''', 'threshold_value', 'ROUND(fractional_sum, 0)', 'Sum of Fractional Values at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Minimum expected sum of all fractional values', 'Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. 
A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'), - ('1012', 'Distinct_Date_Ct', 'Date Count', 'Count of distinct dates at or above reference', 'Tests that the count of distinct dates referenced in the column has not dropped vs. baseline data', 'Drop in count of unique dates recorded in column.', 'Unique dates', 'Count of unique dates in transactional date column', 'general_type=''D'' and date_days_present IS NOT NULL AND functional_table_type NOT LIKE ''%window%''', 'baseline_value,threshold_value', 'date_days_present,date_days_present', 'Distinct Date Count at Baseline,Min Expected Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Minimum distinct date count expected', 'Date Count tests that the count of distinct dates present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained. A failure here would indicate missing records, which could be caused by a processing error or changed upstream data sources.', 'Y'), - ('1013', 'Distinct_Value_Ct', 'Value Count', 'Count of distinct values has not dropped', 'Tests that the count of unique values in the column has not changed from baseline.', 'Count of unique values in column has changed from baseline.', 'Unique Values', NULL, 'distinct_value_ct between 2 and 10 AND value_ct > 0 AND NOT (coalesce(top_freq_values,'''') > '''' AND distinct_value_ct BETWEEN 2 and 10) AND NOT (lower(functional_data_type) LIKE ''%sequence%'' OR lower(functional_data_type) LIKE ''%measurement%'' OR functional_data_type LIKE ''%date%'' OR general_type = ''D'')', 'baseline_value_ct,threshold_value', 'distinct_value_ct,distinct_value_ct', 'Distinct Value Count at Baseline,Min Expected Value Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Expected distinct value count', 'Value Count tests that the count of unique values present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained, or for any dataset where you would expect a set number of distinct values should be present. 
A failure here would indicate missing records or a change in categories or value assignment.', 'Y'), - ('1014', 'Email_Format', 'Email Format', 'Email is correctly formatted', 'Tests that non-blank, non-empty email addresses match the standard format', 'Invalid email address formats found.', 'Invalid emails', 'Number of emails that do not match standard format', 'std_pattern_match=''EMAIL''', 'threshold_value', '0', 'Maximum Invalid Email Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Expected count of invalid email addresses', NULL, 'Y'), - ('1015', 'Future_Date', 'Past Dates', 'Latest date is prior to test run date', 'Tests that the maximum date referenced in the column is no greater than the test date, consistent with baseline data', 'Future date found when absent in baseline data.', 'Future dates', NULL, 'general_type=''D''AND future_date_ct = 0', 'threshold_value', '0', 'Maximum Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Expected count of future dates', NULL, 'Y'), - ('1016', 'Future_Date_1Y', 'Future Year', 'Future dates within year of test run date', 'Tests that the maximum date referenced in the column is no greater than one year beyond the test date, consistent with baseline data', 'Future date beyond one-year found when absent in baseline.', 'Future dates post 1 year', NULL, 'general_type=''D''AND future_date_ct > 0 AND max_date <=''{AS_OF_DATE}''::DATE + INTERVAL''365 DAYS''', 'threshold_value', '0', 'Maximum Post 1-Year Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Expected count of future dates beyond one year', 'Future Year looks for date values in the column that extend beyond one year after the test date. This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. Errors could indicate invalid entries or possibly dummy dates representing blank values.', 'Y'), - ('1017', 'Incr_Avg_Shift', 'New Shift', 'New record mean is consistent with reference', 'Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline.', 'Significant shift in average of new values vs. baseline avg', 'Z-score of mean shift', 'Absolute Z-score (number of SD''s outside mean) of prior avg - incremental avg', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', 'baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value', 'value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2', 'Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Maximum Z-Score (number of SD''s beyond mean) expected', 'This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. 
A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. If variability remains consistent, the problem is more likely to be with the source data itself.', 'Y'), - ('1018', 'LOV_All', 'All Values', 'List of expected values all present in column', 'Tests that all values match a pipe-delimited list of expected values and that all expected values are present', 'Column values found don''t exactly match the expected list of values', 'Values found', NULL, NULL, 'threshold_value', NULL, 'List of Expected Values', NULL, 'Fail', 'CAT', 'column', 'Validity', 'List of values expected, in form (''Val1'',''Val2)', 'This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. This would be appropriate for tables where all category values in the column are represented at least once.', 'Y'), - ('1019', 'LOV_Match', 'Value Match', 'All column values present in expected list', 'Tests that all values in the column match the list-of-values identified in baseline data.', 'Values not matching expected List-of-Values from baseline.', 'Non-matching records', NULL, 'top_freq_values > '''' AND distinct_value_ct BETWEEN 2 and 10 AND NOT (functional_data_type LIKE ''%date%'' OR lower(datatype_suggestion) LIKE ''%date%'' OR general_type = ''D'' OR lower(column_name) IN (''file_name'', ''filename''))', 'baseline_value,threshold_value', '''('' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 2) > '''' THEN '','''''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, ''|'' , 2), '''''''' , '''''''''''' ) ) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 4) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 4), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 6) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 6), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 8) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 8), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 10) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 10), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 12) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 12), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 14) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 14), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 16) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 16), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 18) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 18), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 20) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 20), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END, 2, 999) || 
'')'',0', 'List of Expected Values,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'List of values expected, in form (''Val1'',''Val2)', 'This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change.', 'Y'), - ('1020', 'Min_Date', 'Minimum Date', 'All dates on or after set minimum', 'Tests that the earliest date referenced in the column is no earlier than baseline data', 'The earliest date value found is before the earliest value at baseline.', 'Dates prior to limit', NULL, 'general_type=''D''and min_date IS NOT NULL AND distinct_value_ct > 1', 'baseline_value,threshold_value', 'min_date,0', 'Minimum Date at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Expected count of dates prior to minimum', 'This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. It''s appropriate where new records are added with more recent dates, but old dates dates do not change.', 'Y'), - ('1021', 'Min_Val', 'Minimum Value', 'All values at or above set minimum', 'Tests that the minimum value present in the column is no lower than the minimum value in baseline data', 'Minimum column value less than baseline.', 'Values under limit', NULL, 'general_type=''N''and min_value IS NOT NULL AND (distinct_value_ct >= 2 OR (distinct_value_ct=2 and min_value<>0 and max_value<>1))', 'baseline_value,threshold_value', 'min_value,0', 'Minimum Value at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Expected count of values under limit', 'This test is appropriate for a cumulative dataset only, assuming all prior values are still present. It is also appropriate for any measure that has an absolute, definable minimum value, or a heuristic that makes senes for valid data.', 'Y'), - ('1022', 'Missing_Pct', 'Percent Missing', 'Consistent ratio of missing values', 'Tests for statistically-significant shift in percentage of missing values in column vs. baseline data', 'Significant shift in percent of missing values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'record_ct <> value_ct', 'baseline_ct,baseline_value_ct,threshold_value', 'record_ct,value_ct,2::VARCHAR(10)', 'Baseline Record Count,Baseline Value Count,Standardized Difference Measure', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Expected maximum Cohen''s H Difference', 'This test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. An uptick in missing data may indicate a collection issue at the source. A larger change may indicate a processing failure. A drop in missing data may also be significant, if it affects assumptions built into analytic products downstream. 
You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'), - ('1023', 'Monthly_Rec_Ct', 'Monthly Records', 'At least one date per month present within date range', 'Tests for presence of at least one date per calendar month within min/max date range, per baseline data', 'At least one date per month expected in min/max date range.', 'Missing months', 'Calendar months without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_months_present > 2 AND date_months_present - (datediff( ''MON'' , min_date, max_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', 'threshold_value', '0', 'Threshold Count of Months without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Expected maximum count of calendar months without dates present', 'Monthly Records tests that at least one record is present for every calendar month within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each month. A failure here would suggest missing records for the number of months identified without data. You can adjust the threshold to accept a number of month that you know legitimately have no records.', 'Y'), - ('1024', 'Outlier_Pct_Above', 'Outliers Above', 'Consistent outlier counts over 2 SD above mean', 'Tests that percent of outliers over 2 SD above Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD above the mean is greater than expected threshold.', 'Pct records over limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Expected maximum pct records over upper 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations above the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. 
', 'Y'), - ('1025', 'Outlier_Pct_Below', 'Outliers Below', 'Consistent outlier counts under 2 SD below mean', 'Tests that percent of outliers over 2 SD below Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD below the mean is greater than expected threshold.', 'Pct records under limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Expected maximum pct records over lower 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations below the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. ', 'Y'), - ('1026', 'Pattern_Match', 'Pattern Match', 'Column values match alpha-numeric pattern', 'Tests that all values in the column match the same alpha-numeric pattern identified in baseline data', 'Alpha values do not match consistent pattern in baseline.', 'Pattern Mismatches', NULL, 'fn_charcount(top_patterns, E'' \| '' ) = 1 AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > '''' AND distinct_value_ct > 10', 'baseline_value,threshold_value', 'trim(REPLACE(REPLACE(REPLACE(SPLIT_PART(top_patterns, '' | '', 2), ''A'', ''[A-Z]''), ''N'', ''[0-9]''), ''a'', ''[a-z]'')),0', 'Pattern at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Expected count of pattern mismatches', 'This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database: REGEX matching if available, otherwise LIKE expressions. 
The expected threshold is the number of records that fail to match the defined pattern.', 'Y'), - ('1027', 'PRIOR MATCH', 'Prior Match', 'Column value combinations match prior reference', 'Tests that the same set of column values are present in the current dataset as a different, prior schema.', 'Column values don''t match prior schema values.', 'Mismatched values', NULL, NULL, 'subset_condition,match_schema_name', NULL, 'TODO Fill in default_parm_prompts', NULL, 'Fail', 'QUERY', 'multi-column', 'Consistency', 'Expected count of non-matching value combinations', NULL, 'N'), - ('1028', 'Recency', 'Recency', 'Latest date within expected range of test date', 'Tests that the latest date in column is within a set number of days of the test date', 'Most recent date value not within expected days of test date.', 'Days before test', 'Number of days that most recent date precedes the date of test', 'general_type= ''D'' AND max_date <= run_date AND NOT column_name IN ( ''filedate'' , ''file_date'' ) AND DATEDIFF( ''DAY'' , max_date, run_date) <= 62', 'threshold_value', 'CASE WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 3 THEN DATEDIFF(''DAY'', max_date, run_date) + 3 WHEN DATEDIFF(''DAY'', max_date, run_date) <= 7 then DATEDIFF(''DAY'', max_date, run_date) + 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 7.0) * 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) > 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 30.0) * 30 END', 'Threshold Maximum Days before Test', NULL, 'Warning', 'CAT', 'column', 'Timeliness', 'Expected maximum count of days preceding test date', 'This test evaluates recency based on the latest referenced dates in the column. The test is appropriate for transactional dates and timestamps. The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed. 
', 'Y'), - ('1029', 'RELATIVE ENTROPY', 'Distribution Shift', 'Probability distribution consistent with reference', 'Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using Jensen-Shannon Divergence test', 'Divergence between two distributions exceeds specified threshold.', 'Divergence level (0-1)', 'Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence)', NULL, 'subset_condition,groupby_names,match_schema_name,match_subset_condition,match_groupby_names,threshold_value', NULL, 'Standardized Divergence Measure (0 to 1)', NULL, 'Warning', 'QUERY', 'multi-column', 'Consistency', 'Expected maximum divergence level between 0 and 1', NULL, 'N'), - ('1030', 'Required', 'Required Entry', 'Required non-null value present', 'Tests that a non-null value is present in each record for the column, consistent with baseline data', 'Every record for this column is expected to be filled, but some are missing.', 'Missing values', NULL, 'record_ct = value_ct', 'threshold_value', '0', 'Threshold Missing Value Count', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Expected count of missing values', NULL, 'Y'), - ('1031', 'Row_Ct', 'Row Count', 'Number of rows is at or above threshold', 'Tests that the count of records has not decreased from the baseline count.', 'Row count less than baseline count.', 'Row count', NULL, NULL, 'threshold_value', NULL, 'Threshold Minimum Record Count', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Expected minimum row count', 'Because this tests the row count against a constant minimum threshold, it''s appropriate for any dataset, as long as the number of rows doesn''t radically change from refresh to refresh. But it''s not responsive to change over time. You may want to adjust the threshold periodically if you are dealing with a cumulative dataset.', 'Y'), - ('1032', 'Row_Ct_Pct', 'Row Range', 'Number of rows within percent range of threshold', 'Tests that the count of records is within a percentage above or below the baseline count.', 'Row Count is outside of threshold percent of baseline count.', 'Percent of baseline', 'Row count percent above or below baseline', NULL, 'baseline_ct,threshold_value', NULL, 'Baseline Record Count,Threshold Pct Above or Below Baseline', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Expected percent window below or above baseline', 'This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline.', 'Y'), - ('1033', 'Street_Addr_Pattern', 'Street Address', 'Enough street address entries match defined pattern', 'Tests for percent of records matching standard street address pattern.', 'Percent of values matching standard street address format is under expected threshold.', 'Percent matches', 'Percent of records that match street address pattern', '(std_pattern_match=''STREET_ADDR'') AND (avg_length <> round(avg_length)) AND (avg_embedded_spaces BETWEEN 2 AND 6) AND (avg_length < 35)', 'threshold_value', '75', 'Threshold Pct that Match Address Pattern', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Expected percent of records that match standard street address pattern', 'The street address pattern used in this test should match the vast majority of USA addresses. 
You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries.', 'Y'), - ('1034', 'Unique', 'Unique Values', 'Each column value is unique', 'Tests that no values for the column are repeated in multiple records.', 'Column values should be unique per row.', 'Duplicate values', 'Count of non-unique values', 'record_ct > 500 and record_ct = distinct_value_ct and value_ct > 0', 'threshold_value', '0', 'Threshold Duplicate Value Count', NULL, 'Fail', 'CAT', 'column', 'Uniqueness', 'Expected count of duplicate values', 'This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness. If''s also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.', 'Y'), - ('1035', 'Unique_Pct', 'Percent Unique', 'Consistent ratio of unique values', 'Tests for statistically-significant shift in percentage of unique values vs. baseline data.', 'Significant shift in percent of unique values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'distinct_value_ct > 10', 'baseline_value_ct,baseline_unique_ct,threshold_value', 'value_ct,distinct_value_ct,0.5', 'Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1)', NULL, 'Warning', 'CAT', 'column', 'Uniqueness', 'Expected maximum Cohen''s H Difference', 'You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. 
You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'), - ('1036', 'US_State', 'US State', 'Column value is two-letter US state code', 'Tests that the recorded column value is a valid US state.', 'Column Value is not a valid US state.', 'Not US States', 'Values that doo not match 2-character US state abbreviations.', 'general_type= ''A'' AND column_name ILIKE ''%state%'' AND distinct_value_ct < 70 AND max_length = 2', 'threshold_value', '0', 'Threshold Count not Matching State Abbreviations', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Expected count of values that are not US state abbreviations', 'This test validates entries against a fixed list of two-character US state codes and related Armed Forces codes.', 'Y'), - ('1037', 'Weekly_Rec_Ct', 'Weekly Records', 'At least one date per week present within date range', 'Tests for presence of at least one date per calendar week within min/max date range, per baseline data', 'At least one date per week expected in min/max date range.', 'Missing weeks', 'Calendar weeks without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_weeks_present > 3 AND date_weeks_present - (DATEDIFF(''week'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''week'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', 'threshold_value', '0', 'Threshold Weeks without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Expected maximum count of calendar weeks without dates present', 'Weekly Records tests that at least one record is present for every calendar week within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each week. A failure here would suggest missing records for the number of weeks identified without data. You can adjust the threshold to accept a number of weeks that you know legitimately have no records.', 'Y'), - ('1038', 'WINDOW MATCH NO DROPS', 'Timeframe Minimum', 'Latest timeframe includes all values in prior timeframe', 'Tests that column values in most recent time-window include at least same as prior time window', 'Column values in most recent time-window don''t include all values in prior window.', 'Mismatched values', NULL, NULL, 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'multi-column', 'Consistency', 'Expected count of missing value combinations', NULL, 'N'), - ('1039', 'WINDOW MATCH SAME', 'Timeframe Match', 'Values in latest timeframe all found in prior timeframe', 'Tests for presence of same column values in most recent time-window vs. prior time window', 'Column values don''t match in most recent time-windows.', 'Mismatched values', NULL, NULL, 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'multi-column', 'Consistency', 'Expected count of non-matching value combinations', NULL, 'N'), - ('1040', 'Variability_Increase', 'Variability Increase', 'Variability has increased above threshold', 'Tests that the spread or dispersion of column values has increased significantly over baseline, indicating a drop in stability of the measure.', 'The Standard Deviation of the measure has increased beyond the defined threshold. 
This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'baseline_sd,threshold_value', 'stdev_value,120', 'Std Deviation at Baseline,Expected Maximum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Expected maximum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. An increase in particular could mark new problems in measurement, a more heterogeneous cohort, or that significant outliers have been introduced. Consider this test along with Average Shift and New Shift. If the average shifts as well, there may be a fundamental shift in the dataset or process used to collect the data point. This might suggest a data shift that should be noted and assessed by business users. If the average does not shift, this may point to a data quality or data collection problem. ', 'Y'), - ('1041', 'Variability_Decrease', 'Variability Decrease', 'Variability has decreased below threshold', 'Tests that the spread or dispersion of column values has decreased significantly over baseline, indicating a shift in stability of the measure. This could signal a change in a process or a data quality issue.', 'The Standard Deviation of the measure has decreased below the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'baseline_sd,threshold_value', 'stdev_value, 80', 'Std Deviation at Baseline,Expected Minimum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Expected minimum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. A decrease in particular could indicate an improved process, better precision in measurement, the elimination of outliers, or a more homogeneous cohort. 
', 'Y'), - ('1042', 'Valid_Month', 'Valid Month', 'Valid calendar month in expected format', 'Tests for the presence of a valid representation of a calendar month consistent with the format at baseline.', 'Column values are not a valid representation of a calendar month consistent with the format at baseline.', 'Invalid months', NULL, 'functional_data_type = ''Period Month''', 'threshold_value,baseline_value', '0,CASE WHEN max_length > 3 AND initcap(min_text) = min_text THEN ''''''January'''',''''February'''',''''March'''',''''April'''',''''May'''',''''June'''',''''July'''',''''August'''',''''September'''',''''October'''',''''November'''',''''December'''''' WHEN max_length > 3 AND upper(min_text) = min_text THEN ''''''JANUARY'''',''''FEBRUARY'''',''''MARCH'''',''''APRIL'''',''''MAY'''',''''JUNE'''',''''JULY'''',''''AUGUST'''',''''SEPTEMBER'''',''''OCTOBER'''',''''NOVEMBER'''',''''DECEMBER'''''' WHEN max_length > 3 AND lower(min_text) = min_text THEN ''''''january'''',''''february'''',''''march'''',''''april'''',''''may'''',''''june'''',''''july'''',''''august'''',''''september'''',''''october'''',''''november'''',''''december'''''' WHEN max_length = 3 AND initcap(min_text) = min_text THEN ''''''Jan'''',''''Feb'''',''''Mar'''',''''Apr'''',''''May'''',''''Jun'''',''''Jul'''',''''Aug'''',''''Sep'''',''''Oct'''',''''Nov'''',''''Dec'''''' WHEN max_length = 3 AND upper(min_text) = min_text THEN ''''''JAN'''',''''FEB'''',''''MAR'''',''''APR'''',''''MAY'''',''''JUN'''',''''JUL'''',''''AUG'''',''''SEP'''',''''OCT'''',''''NOV'''',''''DEC'''''' WHEN max_length = 3 AND lower(min_text) = min_text THEN ''''''jan'''',''''feb'''',''''mar'''',''''apr'''',''''may'''',''''jun'''',''''jul'''',''''aug'''',''''sep'''',''''oct'''',''''nov'''',''''dec'''''' WHEN max_length = 2 AND min_text = ''01'' THEN ''''''01'''',''''02'''',''''03'''',''''04'''',''''05'''',''''06'''',''''07'''',''''08'''',''''09'''',''''10'''',''''11'''',''''12'''''' WHEN max_length = 2 AND min_text = ''1'' THEN ''''''1'''',''''2'''',''''3'''',''''4'''',''''5'''',''''6'''',''''7'''',''''8'''',''''9'''',''''10'''',''''11'''',''''12'''''' WHEN min_value = 1 THEN ''1,2,3,4,5,6,7,8,9,10,11,12'' ELSE ''NULL'' END', 'Threshold Invalid Months,Valid Month List', 'The acceptable number of records with invalid months present.|List of valid month values for this field, in quotes if field is numeric, separated by commas.', 'Fail', 'CAT', 'column', 'Validity', 'Expected count of invalid months', NULL, 'N'); - +A query can refer to any tables in the database. You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.', 'Fail', 'QUERY', 'custom', 'Accuracy', 'Data Drift', 'Expected count of errors found by custom query', 'This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. 
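-- [Editor's illustrative sketch; not part of the seed data or the patch.] The Custom Query
-- description above says the user-defined query is embedded in a parent query that returns
-- the count of error rows. Assuming a hypothetical orders/customers rule, the generated test
-- conceptually looks like this ({DATA_SCHEMA} is the placeholder the description documents):
SELECT COUNT(*) AS error_ct
FROM (
    -- user-defined business-rule query: any returned row counts as one error condition
    SELECT o.order_id
    FROM {DATA_SCHEMA}.orders o
    LEFT JOIN {DATA_SCHEMA}.customers c ON c.customer_id = o.customer_id
    WHERE c.customer_id IS NULL
) q;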
This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow-up. Interpretation is based on the user-defined meaning of the test.', 'Y'), + ('1009', 'Daily_Record_Ct', 'Daily Records', 'All dates present within date range', 'Tests for presence of every calendar date within min/max date range, per baseline data', 'Not every date value between min and max dates is present, unlike at baseline.', 'Missing dates', NULL, 'general_type= ''D'' AND date_days_present > 21 AND date_days_present - (DATEDIFF(''day'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''day'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', 'threshold_value', '0', 'Threshold Missing Calendar Days', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Volume', 'Missing calendar days within min/max range', 'Daily Records tests that at least one record is present for every day within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each day. A failure here would suggest missing records for the number of days identified without data. You can adjust the threshold to accept a number of days that you know legitimately have no records. ', 'Y'), + ('1010', 'DATA MATCH', 'Combo Match', 'Column value combinations match reference', 'Tests for the presence of the same set of column values in a reference table', 'Column values don''t match reference table values.', 'Mismatched values', NULL, NULL, 'subset_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition', NULL, 'Record Subset Condition,Column Names to Match,Schema Name,Table Name,Match Schema Name,Match Table Name,Match Table Record Subset Condition', NULL, 'Fail', 'QUERY', 'multi-column', 'Validity', 'Schema Drift', 'Expected count of non-matching value combinations', NULL, 'N'), + ('1011', 'Dec_Trunc', 'Decimal Truncation', 'Sum of fractional values at or above reference', 'Tests for decimal truncation by confirming that the sum of fractional values in data is no less than the sum at baseline', 'The sum of fractional values is under baseline, which may indicate decimal truncation', 'Fractional sum', 'The sum of all decimal values from all data for this column', 'fractional_sum IS NOT NULL AND functional_table_type LIKE''%cumulative%''', 'threshold_value', 'ROUND(fractional_sum, 0)', 'Sum of Fractional Values at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Minimum expected sum of all fractional values', 'Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'), + ('1012', 'Distinct_Date_Ct', 'Date Count', 'Count of distinct dates at or above reference', 'Tests that the count of distinct dates referenced in the column has not dropped vs. 
baseline data', 'Drop in count of unique dates recorded in column.', 'Unique dates', 'Count of unique dates in transactional date column', 'general_type=''D'' and date_days_present IS NOT NULL AND functional_table_type NOT LIKE ''%window%''', 'baseline_value,threshold_value', 'date_days_present,date_days_present', 'Distinct Date Count at Baseline,Min Expected Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Minimum distinct date count expected', 'Date Count tests that the count of distinct dates present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained. A failure here would indicate missing records, which could be caused by a processing error or changed upstream data sources.', 'Y'), + ('1013', 'Distinct_Value_Ct', 'Value Count', 'Count of distinct values has not dropped', 'Tests that the count of unique values in the column has not changed from baseline.', 'Count of unique values in column has changed from baseline.', 'Unique Values', NULL, 'distinct_value_ct between 2 and 10 AND value_ct > 0 AND NOT (coalesce(top_freq_values,'''') > '''' AND distinct_value_ct BETWEEN 2 and 10) AND NOT (lower(functional_data_type) LIKE ''%sequence%'' OR lower(functional_data_type) LIKE ''%measurement%'' OR functional_data_type LIKE ''%date%'' OR general_type = ''D'')', 'baseline_value_ct,threshold_value', 'distinct_value_ct,distinct_value_ct', 'Distinct Value Count at Baseline,Min Expected Value Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected distinct value count', 'Value Count tests that the count of unique values present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained, or for any dataset where you would expect a set number of distinct values should be present. 
A failure here would indicate missing records or a change in categories or value assignment.', 'Y'), + ('1014', 'Email_Format', 'Email Format', 'Email is correctly formatted', 'Tests that non-blank, non-empty email addresses match the standard format', 'Invalid email address formats found.', 'Invalid emails', 'Number of emails that do not match standard format', 'std_pattern_match=''EMAIL''', 'threshold_value', '0', 'Maximum Invalid Email Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid email addresses', NULL, 'Y'), + ('1015', 'Future_Date', 'Past Dates', 'Latest date is prior to test run date', 'Tests that the maximum date referenced in the column is no greater than the test date, consistent with baseline data', 'Future date found when absent in baseline data.', 'Future dates', NULL, 'general_type=''D''AND future_date_ct = 0', 'threshold_value', '0', 'Maximum Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates', NULL, 'Y'), + ('1016', 'Future_Date_1Y', 'Future Year', 'Future dates within year of test run date', 'Tests that the maximum date referenced in the column is no greater than one year beyond the test date, consistent with baseline data', 'Future date beyond one-year found when absent in baseline.', 'Future dates post 1 year', NULL, 'general_type=''D''AND future_date_ct > 0 AND max_date <=''{AS_OF_DATE}''::DATE + INTERVAL''365 DAYS''', 'threshold_value', '0', 'Maximum Post 1-Year Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates beyond one year', 'Future Year looks for date values in the column that extend beyond one year after the test date. This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. Errors could indicate invalid entries or possibly dummy dates representing blank values.', 'Y'), + ('1017', 'Incr_Avg_Shift', 'New Shift', 'New record mean is consistent with reference', 'Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline.', 'Significant shift in average of new values vs. baseline avg', 'Z-score of mean shift', 'Absolute Z-score (number of SD''s outside mean) of prior avg - incremental avg', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', 'baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value', 'value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2', 'Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Maximum Z-Score (number of SD''s beyond mean) expected', 'This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. 
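-- [Editor's illustrative sketch; not part of the seed data.] Conceptual form of the New Shift
-- (Incr_Avg_Shift) comparison described above: the baseline sum and count are removed from the
-- current totals to isolate the mean of new rows, which is then expressed in units of the
-- baseline standard deviation. Placeholder names follow the {BASELINE_*} convention used by the
-- templates later in this file; the exact generated expression may differ.
SELECT ABS(
           (
             (CAST(SUM({COLUMN_NAME}) AS FLOAT) - {BASELINE_SUM})
               / NULLIF(COUNT({COLUMN_NAME}) - {BASELINE_VALUE_CT}, 0)
             - {BASELINE_AVG}
           ) / NULLIF({BASELINE_SD}, 0)
       ) AS incremental_z_score
FROM {TARGET_SCHEMA}.{TABLE_NAME};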
Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. If variability remains consistent, the problem is more likely to be with the source data itself.', 'Y'), + ('1018', 'LOV_All', 'All Values', 'List of expected values all present in column', 'Tests that all values match a pipe-delimited list of expected values and that all expected values are present', 'Column values found don''t exactly match the expected list of values', 'Values found', NULL, NULL, 'threshold_value', NULL, 'List of Expected Values', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. This would be appropriate for tables where all category values in the column are represented at least once.', 'Y'), + ('1019', 'LOV_Match', 'Value Match', 'All column values present in expected list', 'Tests that all values in the column match the list-of-values identified in baseline data.', 'Values not matching expected List-of-Values from baseline.', 'Non-matching records', NULL, 'top_freq_values > '''' AND distinct_value_ct BETWEEN 2 and 10 AND NOT (functional_data_type LIKE ''%date%'' OR lower(datatype_suggestion) LIKE ''%date%'' OR general_type = ''D'' OR lower(column_name) IN (''file_name'', ''filename''))', 'baseline_value,threshold_value', '''('' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 2) > '''' THEN '','''''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, ''|'' , 2), '''''''' , '''''''''''' ) ) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 4) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 4), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 6) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 6), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 8) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 8), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 10) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 10), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 12) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 12), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 14) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 14), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 16) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 16), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 18) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 18), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 20) > '''' THEN '','''''' || 
TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 20), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END, 2, 999) || '')'',0', 'List of Expected Values,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2'')', 'This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change.', 'Y'),
+  ('1020', 'Min_Date', 'Minimum Date', 'All dates on or after set minimum', 'Tests that the earliest date referenced in the column is no earlier than baseline data', 'The earliest date value found is before the earliest value at baseline.', 'Dates prior to limit', NULL, 'general_type=''D''and min_date IS NOT NULL AND distinct_value_ct > 1', 'baseline_value,threshold_value', 'min_date,0', 'Minimum Date at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of dates prior to minimum', 'This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. It''s appropriate where new records are added with more recent dates, but old dates do not change.', 'Y'),
+  ('1021', 'Min_Val', 'Minimum Value', 'All values at or above set minimum', 'Tests that the minimum value present in the column is no lower than the minimum value in baseline data', 'Minimum column value less than baseline.', 'Values under limit', NULL, 'general_type=''N''and min_value IS NOT NULL AND (distinct_value_ct >= 2 OR (distinct_value_ct=2 and min_value<>0 and max_value<>1))', 'baseline_value,threshold_value', 'min_value,0', 'Minimum Value at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values under limit', 'This test is appropriate for a cumulative dataset only, assuming all prior values are still present. It is also appropriate for any measure that has an absolute, definable minimum value, or a heuristic that makes sense for valid data.', 'Y'),
+  ('1022', 'Missing_Pct', 'Percent Missing', 'Consistent ratio of missing values', 'Tests for statistically-significant shift in percentage of missing values in column vs. baseline data', 'Significant shift in percent of missing values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'record_ct <> value_ct', 'baseline_ct,baseline_value_ct,threshold_value', 'record_ct,value_ct,2::VARCHAR(10)', 'Baseline Record Count,Baseline Value Count,Standardized Difference Measure', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'This test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. An uptick in missing data may indicate a collection issue at the source. A larger change may indicate a processing failure. A drop in missing data may also be significant, if it affects assumptions built into analytic products downstream. 
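-- [Editor's illustrative sketch; not part of the seed data.] Cohen's H, as used by the Percent
-- Missing test above, compares two ratios on an arcsine-transformed scale:
-- h = | 2*asin(sqrt(p_baseline)) - 2*asin(sqrt(p_current)) |, where p is the filled-value ratio.
-- The arithmetic mirrors the Unique_Pct templates further down in this file; the {BASELINE_*}
-- placeholder names are assumed from the parameter names above.
SELECT ABS(
           2.0 * ASIN(SQRT(CAST({BASELINE_VALUE_CT} AS FLOAT) / NULLIF({BASELINE_CT}, 0)))
         - 2.0 * ASIN(SQRT(CAST(COUNT({COLUMN_NAME}) AS FLOAT) / NULLIF(COUNT(*), 0)))
       ) AS cohens_h
FROM {TARGET_SCHEMA}.{TABLE_NAME};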
You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'),
+  ('1023', 'Monthly_Rec_Ct', 'Monthly Records', 'At least one date per month present within date range', 'Tests for presence of at least one date per calendar month within min/max date range, per baseline data', 'At least one date per month expected in min/max date range.', 'Missing months', 'Calendar months without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_months_present > 2 AND date_months_present - (datediff( ''MON'' , min_date, max_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', 'threshold_value', '0', 'Threshold Count of Months without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar months without dates present', 'Monthly Records tests that at least one record is present for every calendar month within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each month. A failure here would suggest missing records for the number of months identified without data. You can adjust the threshold to accept a number of months that you know legitimately have no records.', 'Y'),
+  ('1024', 'Outlier_Pct_Above', 'Outliers Above', 'Consistent outlier counts over 2 SD above mean', 'Tests that percent of outliers over 2 SD above Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD above the mean is greater than expected threshold.', 'Pct records over limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over upper 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations above the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. 
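-- [Editor's illustrative sketch; not part of the seed data.] The Outliers Above measure described
-- above: the share of current values more than 2 baseline standard deviations above the baseline
-- mean, compared against the expected maximum percentage (default 0.05).
SELECT SUM(CASE WHEN {COLUMN_NAME} > {BASELINE_AVG} + 2.0 * {BASELINE_SD} THEN 1 ELSE 0 END)::FLOAT
       / NULLIF(COUNT({COLUMN_NAME}), 0) AS pct_over_upper_2sd
FROM {TARGET_SCHEMA}.{TABLE_NAME};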
', 'Y'), + ('1025', 'Outlier_Pct_Below', 'Outliers Below', 'Consistent outlier counts under 2 SD below mean', 'Tests that percent of outliers over 2 SD below Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD below the mean is greater than expected threshold.', 'Pct records under limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over lower 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations below the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. ', 'Y'), + ('1026', 'Pattern_Match', 'Pattern Match', 'Column values match alpha-numeric pattern', 'Tests that all values in the column match the same alpha-numeric pattern identified in baseline data', 'Alpha values do not match consistent pattern in baseline.', 'Pattern Mismatches', NULL, 'fn_charcount(top_patterns, E'' \| '' ) = 1 AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > '''' AND distinct_value_ct > 10', 'baseline_value,threshold_value', 'trim(REPLACE(REPLACE(REPLACE(SPLIT_PART(top_patterns, '' | '', 2), ''A'', ''[A-Z]''), ''N'', ''[0-9]''), ''a'', ''[a-z]'')),0', 'Pattern at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of pattern mismatches', 'This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database: REGEX matching if available, otherwise LIKE expressions. 
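-- [Editor's illustrative sketch; not part of the seed data.] Pattern Match reduces each value to a
-- character-class pattern (letters to A/a, digits to N) and compares it to the profiled baseline
-- pattern; the same REGEXP_REPLACE chain appears in the Redshift review queries later in this file.
SELECT {COLUMN_NAME},
       REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE({COLUMN_NAME},
           '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') AS observed_pattern
FROM {TARGET_SCHEMA}.{TABLE_NAME};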
The expected threshold is the number of records that fail to match the defined pattern.', 'Y'), + ('1027', 'PRIOR MATCH', 'Prior Match', 'Column value combinations match prior reference', 'Tests that the same set of column values are present in the current dataset as a different, prior schema.', 'Column values don''t match prior schema values.', 'Mismatched values', NULL, NULL, 'subset_condition,match_schema_name', NULL, 'TODO Fill in default_parm_prompts', NULL, 'Fail', 'QUERY', 'multi-column', 'Consistency', 'Data Drift', 'Expected count of non-matching value combinations', NULL, 'N'), + ('1028', 'Recency', 'Recency', 'Latest date within expected range of test date', 'Tests that the latest date in column is within a set number of days of the test date', 'Most recent date value not within expected days of test date.', 'Days before test', 'Number of days that most recent date precedes the date of test', 'general_type= ''D'' AND max_date <= run_date AND NOT column_name IN ( ''filedate'' , ''file_date'' ) AND NOT functional_data_type IN (''Future Date'', ''Schedule Date'') AND DATEDIFF( ''DAY'' , max_date, run_date) <= 62', 'threshold_value', 'CASE WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 3 THEN DATEDIFF(''DAY'', max_date, run_date) + 3 WHEN DATEDIFF(''DAY'', max_date, run_date) <= 7 then DATEDIFF(''DAY'', max_date, run_date) + 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 7.0) * 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) > 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 30.0) * 30 END', 'Threshold Maximum Days before Test', NULL, 'Warning', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected maximum count of days preceding test date', 'This test evaluates recency based on the latest referenced dates in the column. The test is appropriate for transactional dates and timestamps. The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed. 
', 'Y'), + ('1029', 'RELATIVE ENTROPY', 'Distribution Shift', 'Probability distribution consistent with reference', 'Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using Jensen-Shannon Divergence test', 'Divergence between two distributions exceeds specified threshold.', 'Divergence level (0-1)', 'Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence)', NULL, 'subset_condition,groupby_names,match_schema_name,match_subset_condition,match_groupby_names,threshold_value', NULL, 'Standardized Divergence Measure (0 to 1)', NULL, 'Warning', 'QUERY', 'multi-column', 'Consistency', 'Data Drift', 'Expected maximum divergence level between 0 and 1', NULL, 'N'), + ('1030', 'Required', 'Required Entry', 'Required non-null value present', 'Tests that a non-null value is present in each record for the column, consistent with baseline data', 'Every record for this column is expected to be filled, but some are missing.', 'Missing values', NULL, 'record_ct = value_ct', 'threshold_value', '0', 'Threshold Missing Value Count', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Schema Drift', 'Expected count of missing values', NULL, 'Y'), + ('1031', 'Row_Ct', 'Row Count', 'Number of rows is at or above threshold', 'Tests that the count of records has not decreased from the baseline count.', 'Row count less than baseline count.', 'Row count', NULL, NULL, 'threshold_value', NULL, 'Threshold Minimum Record Count', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected minimum row count', 'Because this tests the row count against a constant minimum threshold, it''s appropriate for any dataset, as long as the number of rows doesn''t radically change from refresh to refresh. But it''s not responsive to change over time. You may want to adjust the threshold periodically if you are dealing with a cumulative dataset.', 'Y'), + ('1032', 'Row_Ct_Pct', 'Row Range', 'Number of rows within percent range of threshold', 'Tests that the count of records is within a percentage above or below the baseline count.', 'Row Count is outside of threshold percent of baseline count.', 'Percent of baseline', 'Row count percent above or below baseline', NULL, 'baseline_ct,threshold_value', NULL, 'Baseline Record Count,Threshold Pct Above or Below Baseline', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected percent window below or above baseline', 'This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline.', 'Y'), + ('1033', 'Street_Addr_Pattern', 'Street Address', 'Enough street address entries match defined pattern', 'Tests for percent of records matching standard street address pattern.', 'Percent of values matching standard street address format is under expected threshold.', 'Percent matches', 'Percent of records that match street address pattern', '(std_pattern_match=''STREET_ADDR'') AND (avg_length <> round(avg_length)) AND (avg_embedded_spaces BETWEEN 2 AND 6) AND (avg_length < 35)', 'threshold_value', '75', 'Threshold Pct that Match Address Pattern', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected percent of records that match standard street address pattern', 'The street address pattern used in this test should match the vast majority of USA addresses. 
You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries.', 'Y'),
+  ('1034', 'Unique', 'Unique Values', 'Each column value is unique', 'Tests that no values for the column are repeated in multiple records.', 'Column values should be unique per row.', 'Duplicate values', 'Count of non-unique values', 'record_ct > 500 and record_ct = distinct_value_ct and value_ct > 0', 'threshold_value', '0', 'Threshold Duplicate Value Count', NULL, 'Fail', 'CAT', 'column', 'Uniqueness', 'Schema Drift', 'Expected count of duplicate values', 'This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness. It''s also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.', 'Y'),
+  ('1035', 'Unique_Pct', 'Percent Unique', 'Consistent ratio of unique values', 'Tests for statistically-significant shift in percentage of unique values vs. baseline data.', 'Significant shift in percent of unique values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'distinct_value_ct > 10', 'baseline_value_ct,baseline_unique_ct,threshold_value', 'value_ct,distinct_value_ct,0.5', 'Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1)', NULL, 'Warning', 'CAT', 'column', 'Uniqueness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. 
You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'),
+  ('1036', 'US_State', 'US State', 'Column value is two-letter US state code', 'Tests that the recorded column value is a valid US state.', 'Column Value is not a valid US state.', 'Not US States', 'Values that do not match 2-character US state abbreviations.', 'general_type= ''A'' AND column_name ILIKE ''%state%'' AND distinct_value_ct < 70 AND max_length = 2', 'threshold_value', '0', 'Threshold Count not Matching State Abbreviations', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values that are not US state abbreviations', 'This test validates entries against a fixed list of two-character US state codes and related Armed Forces codes.', 'Y'),
+  ('1037', 'Weekly_Rec_Ct', 'Weekly Records', 'At least one date per week present within date range', 'Tests for presence of at least one date per calendar week within min/max date range, per baseline data', 'At least one date per week expected in min/max date range.', 'Missing weeks', 'Calendar weeks without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_weeks_present > 3 AND date_weeks_present - (DATEDIFF(''week'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''week'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', 'threshold_value', '0', 'Threshold Weeks without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar weeks without dates present', 'Weekly Records tests that at least one record is present for every calendar week within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each week. A failure here would suggest missing records for the number of weeks identified without data. You can adjust the threshold to accept a number of weeks that you know legitimately have no records.', 'Y'),
+  ('1038', 'WINDOW MATCH NO DROPS', 'Timeframe Minimum', 'Latest timeframe includes all values in prior timeframe', 'Tests that column values in most recent time-window include at least same as prior time window', 'Column values in most recent time-window don''t include all values in prior window.', 'Mismatched values', NULL, NULL, 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'multi-column', 'Consistency', 'Data Drift', 'Expected count of missing value combinations', NULL, 'N'),
+  ('1039', 'WINDOW MATCH SAME', 'Timeframe Match', 'Values in latest timeframe all found in prior timeframe', 'Tests for presence of same column values in most recent time-window vs. prior time window', 'Column values don''t match in most recent time-windows.', 'Mismatched values', NULL, NULL, 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'multi-column', 'Consistency', 'Data Drift', 'Expected count of non-matching value combinations', NULL, 'N'),
+  ('1040', 'Variability_Increase', 'Variability Increase', 'Variability has increased above threshold', 'Tests that the spread or dispersion of column values has increased significantly over baseline, indicating a drop in stability of the measure.', 'The Standard Deviation of the measure has increased beyond the defined threshold. 
This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'baseline_sd,threshold_value', 'stdev_value,120', 'Std Deviation at Baseline,Expected Maximum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. An increase in particular could mark new problems in measurement, a more heterogeneous cohort, or that significant outliers have been introduced. Consider this test along with Average Shift and New Shift. If the average shifts as well, there may be a fundamental shift in the dataset or process used to collect the data point. This might suggest a data shift that should be noted and assessed by business users. If the average does not shift, this may point to a data quality or data collection problem. ', 'Y'), + ('1041', 'Variability_Decrease', 'Variability Decrease', 'Variability has decreased below threshold', 'Tests that the spread or dispersion of column values has decreased significantly over baseline, indicating a shift in stability of the measure. This could signal a change in a process or a data quality issue.', 'The Standard Deviation of the measure has decreased below the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'baseline_sd,threshold_value', 'stdev_value, 80', 'Std Deviation at Baseline,Expected Minimum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected minimum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. A decrease in particular could indicate an improved process, better precision in measurement, the elimination of outliers, or a more homogeneous cohort. 
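-- [Editor's illustrative sketch; not part of the seed data.] Both variability tests above evaluate
-- the current standard deviation as a percentage of the baseline value, much as the flavor-specific
-- templates later in this file do; Variability Increase fails above the expected maximum percent
-- and Variability Decrease fails below the expected minimum percent.
SELECT 100.0 * STDDEV(CAST({COLUMN_NAME} AS FLOAT)) / NULLIF({BASELINE_SD}, 0) AS pct_of_baseline_sd
FROM {TARGET_SCHEMA}.{TABLE_NAME};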
', 'Y'), + ('1042', 'Valid_Month', 'Valid Month', 'Valid calendar month in expected format', 'Tests for the presence of a valid representation of a calendar month consistent with the format at baseline.', 'Column values are not a valid representation of a calendar month consistent with the format at baseline.', 'Invalid months', NULL, 'functional_data_type = ''Period Month''', 'threshold_value,baseline_value', '0,CASE WHEN max_length > 3 AND initcap(min_text) = min_text THEN ''''''January'''',''''February'''',''''March'''',''''April'''',''''May'''',''''June'''',''''July'''',''''August'''',''''September'''',''''October'''',''''November'''',''''December'''''' WHEN max_length > 3 AND upper(min_text) = min_text THEN ''''''JANUARY'''',''''FEBRUARY'''',''''MARCH'''',''''APRIL'''',''''MAY'''',''''JUNE'''',''''JULY'''',''''AUGUST'''',''''SEPTEMBER'''',''''OCTOBER'''',''''NOVEMBER'''',''''DECEMBER'''''' WHEN max_length > 3 AND lower(min_text) = min_text THEN ''''''january'''',''''february'''',''''march'''',''''april'''',''''may'''',''''june'''',''''july'''',''''august'''',''''september'''',''''october'''',''''november'''',''''december'''''' WHEN max_length = 3 AND initcap(min_text) = min_text THEN ''''''Jan'''',''''Feb'''',''''Mar'''',''''Apr'''',''''May'''',''''Jun'''',''''Jul'''',''''Aug'''',''''Sep'''',''''Oct'''',''''Nov'''',''''Dec'''''' WHEN max_length = 3 AND upper(min_text) = min_text THEN ''''''JAN'''',''''FEB'''',''''MAR'''',''''APR'''',''''MAY'''',''''JUN'''',''''JUL'''',''''AUG'''',''''SEP'''',''''OCT'''',''''NOV'''',''''DEC'''''' WHEN max_length = 3 AND lower(min_text) = min_text THEN ''''''jan'''',''''feb'''',''''mar'''',''''apr'''',''''may'''',''''jun'''',''''jul'''',''''aug'''',''''sep'''',''''oct'''',''''nov'''',''''dec'''''' WHEN max_length = 2 AND min_text = ''01'' THEN ''''''01'''',''''02'''',''''03'''',''''04'''',''''05'''',''''06'''',''''07'''',''''08'''',''''09'''',''''10'''',''''11'''',''''12'''''' WHEN max_length = 2 AND min_text = ''1'' THEN ''''''1'''',''''2'''',''''3'''',''''4'''',''''5'''',''''6'''',''''7'''',''''8'''',''''9'''',''''10'''',''''11'''',''''12'''''' WHEN min_value = 1 THEN ''1,2,3,4,5,6,7,8,9,10,11,12'' ELSE ''NULL'' END', 'Threshold Invalid Months,Valid Month List', 'The acceptable number of records with invalid months present.|List of valid month values for this field, in quotes if field is numeric, separated by commas.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid months', NULL, 'N'), + ('1043', 'Valid_Characters', 'Valid Characters', 'Column contains no invalid characters', 'Tests for the presence of non-printing characters, leading spaces, or surrounding quotes.', 'Invalid characters, such as non-printing characters, leading spaces, or surrounding quotes, were found.', 'Invalid records', 'Expected count of values with invalid characters', 'general_type = ''A''', 'threshold_value', '0', NULL, 'The acceptable number of records with invalid character values present.', 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', 'This test looks for the presence of non-printing ASCII characters that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. 
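-- [Editor's illustrative sketch; not part of the seed data.] The Valid Characters test above counts
-- values containing non-printing (control) characters, leading spaces, or surrounding quotes;
-- PostgreSQL/Redshift-style syntax shown here, with flavor-specific templates further down in this file.
SELECT SUM(CASE WHEN {COLUMN_NAME} ~ '[[:cntrl:]]'
                  OR {COLUMN_NAME} LIKE ' %'
                  OR {COLUMN_NAME} LIKE '''%'''
                  OR {COLUMN_NAME} LIKE '"%"'
                THEN 1 ELSE 0 END) AS invalid_value_ct
FROM {TARGET_SCHEMA}.{TABLE_NAME};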
Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream.', 'N'), + ('1044', 'Valid_US_Zip', 'Valid US Zip', 'Valid USA Postal Codes', 'Tests that postal codes match the 5 or 9 digit standard US format', 'Invalid US Zip Code formats found.', 'Invalid Zip Codes', 'Expected count of values with invalid Zip Codes', 'functional_data_type = ''Zip''', 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', NULL, 'Y'), + ('1045', 'Valid_US_Zip3', 'Valid US Zip-3 ', 'Valid USA Zip-3 Prefix', 'Tests that postal codes match the 3 digit format of a regional prefix.', 'Invalid 3-digit US Zip Code regional prefix formats found.', 'Invalid Zip-3 Prefix', 'Expected count of values with invalid Zip-3 Prefix Codes', 'functional_data_type = ''Zip3''', 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Zip3 Count', 'This test looks for the presence of values that fail to match the three-digit numeric code expected for US Zip Code regional prefixes. These prefixes are often used to roll up Zip Code data to a regional level, and may be critical to anonymize detailed data and protect PID. Depending on your needs and regulatory requirements, longer zip codes could place PID at risk.', 'Y'); TRUNCATE TABLE generation_sets; @@ -337,11 +337,6 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('3032', 'Variability_Decrease', 'mssql', '100.0*STDEV(CAST("{COLUMN_NAME}" AS FLOAT))/{BASELINE_SD}', '<', '{THRESHOLD_VALUE}'), ('4031', 'Variability_Increase', 'postgresql', '100.0*STDDEV(CAST("{COLUMN_NAME}" AS FLOAT))/{BASELINE_SD}', '>', '{THRESHOLD_VALUE}'), ('4032', 'Variability_Decrease', 'postgresql', '100.0*STDDEV(CAST("{COLUMN_NAME}" AS FLOAT))/{BASELINE_SD}', '<', '{THRESHOLD_VALUE}'), - ('1033', 'Valid_Month', 'redshift', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('2033', 'Valid_Month', 'snowflake', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3033', 'Valid_Month', 'mssql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4033', 'Valid_Month', 'postgresql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5033', 'Valid_Month', 'trino', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('5001', 'Alpha_Trunc', 'trino', 'MAX(LENGTH({COLUMN_NAME}))', '<', '{THRESHOLD_VALUE}'), ('5002', 'Avg_Shift', 'trino', 'ABS( (CAST(AVG({COLUMN_NAME} AS REAL)) - {BASELINE_AVG}) / SQRT(((CAST(COUNT({COLUMN_NAME}) AS REAL)-1)*STDDEV({COLUMN_NAME})^2 + (CAST({BASELINE_VALUE_CT} AS REAL)-1) * CAST({BASELINE_SD} AS REAL)^2) /NULLIF(CAST(COUNT({COLUMN_NAME}) AS REAL) + CAST({BASELINE_VALUE_CT} AS REAL), 0) ))', '>=', '{THRESHOLD_VALUE}'), @@ -374,7 +369,31 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('5029', 'Unique_Pct', 'trino', 'ABS( 2.0 * ASIN( SQRT(CAST({BASELINE_UNIQUE_CT} AS REAL) / CAST({BASELINE_VALUE_CT} AS REAL) ) ) - 2 * ASIN( SQRT( CAST(COUNT( DISTINCT {COLUMN_NAME} ) AS REAL) / CAST(NULLIF(COUNT( {COLUMN_NAME} ), 0) AS REAL) )))', '>=', '{THRESHOLD_VALUE}'), ('5030', 'Weekly_Rec_Ct', 'trino', 
'MAX(DATE_DIFF(''week'', CAST(''1800-01-01'' AS DATE), "{COLUMN_NAME}")) - MIN(DATE_DIFF(''week'', CAST(''1800-01-01'' AS DATE), "{COLUMN_NAME}")) +1 - COUNT(DISTINCT DATE_DIFF(''week'', CAST(''1800-01-01'' AS DATE), "{COLUMN_NAME}"))', '>', '{THRESHOLD_VALUE}'), ('5031', 'Variability_Increase', 'trino', '100.0*STDDEV(CAST("{COLUMN_NAME}" AS REAL))/{BASELINE_SD}', '>', '{THRESHOLD_VALUE}'), - ('5032', 'Variability_Decrease', 'trino', '100.0*STDDEV(CAST("{COLUMN_NAME}" AS REAL))/{BASELINE_SD}', '<', '{THRESHOLD_VALUE}'); + ('5032', 'Variability_Decrease', 'trino', '100.0*STDDEV(CAST("{COLUMN_NAME}" AS REAL))/{BASELINE_SD}', '<', '{THRESHOLD_VALUE}'), + + ('1033', 'Valid_Month', 'redshift', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('2033', 'Valid_Month', 'snowflake', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('3033', 'Valid_Month', 'mssql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('4033', 'Valid_Month', 'postgresql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('5033', 'Valid_Month', 'trino', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + + ('1034', 'Valid_US_Zip', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} NOT SIMILAR TO ''([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('4034', 'Valid_US_Zip', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} NOT SIMILAR TO ''([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('2034', 'Valid_US_Zip', 'snowflake', 'SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME}::VARCHAR, ''^([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})$'') = False THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('5034', 'Valid_US_Zip', 'trino', 'SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME}, ''^([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})$'') = False THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('3034', 'Valid_US_Zip', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} LIKE ''[0-9][0-9][0-9][0-9][0-9]'' OR {COLUMN_NAME} LIKE ''[0-9][0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'' OR {COLUMN_NAME} LIKE ''[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'' THEN 0 ELSE 1 END)', '>', '{THRESHOLD_VALUE}'), + + ('1035', 'Valid_US_Zip3', 'redshift', 'SUM(CASE WHEN LENGTH({COLUMN_NAME}) <> 3 OR TRANSLATE({COLUMN_NAME},''0123456789'','''') <> '''' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('4035', 'Valid_US_Zip3', 'postgresql', 'SUM(CASE WHEN LENGTH({COLUMN_NAME}) <> 3 OR TRANSLATE({COLUMN_NAME},''0123456789'','''') <> '''' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('2035', 'Valid_US_Zip3', 'snowflake', 'SUM(CASE WHEN LENGTH({COLUMN_NAME}) <> 3 OR TRANSLATE({COLUMN_NAME},''0123456789'','''') <> '''' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('5035', 'Valid_US_Zip3', 'trino', 'SUM(CASE WHEN LENGTH({COLUMN_NAME}) <> 3 OR TRANSLATE({COLUMN_NAME},''0123456789'','''') <> '''' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('3035', 'Valid_US_Zip3', 'mssql', 'SUM(CASE WHEN LENGTH({COLUMN_NAME}) <> 3 OR TRANSLATE({COLUMN_NAME},''0123456789'','''') <> '''' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + + ('1036', 'Valid_Characters', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} ~ ''[[:cntrl:]]'' OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE 
''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('4036', 'Valid_Characters', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} ~ ''[[:cntrl:]]'' OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME}::VARCHAR LIKE ''''''%'''''' OR column_name::VARCHAR LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('2036', 'Valid_Characters', 'snowflake', 'SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME}::VARCHAR, ''.*[[:cntrl:]].*'') OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('5036', 'Valid_Characters', 'trino', 'SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME}, ''[\x00-\x1F\x7F]'') OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR column_name LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('3036', 'Valid_Characters', 'mssql', 'SUM(CASE WHEN PATINDEX(''%['' + CHAR(1) + ''-'' + CHAR(8) + CHAR(11) + CHAR(12) + CHAR(14) + ''-'' + CHAR(31) + '']%'', {COLUMN_NAME}) > 0 OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR column_name LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'); TRUNCATE TABLE target_data_lookups; @@ -415,7 +434,7 @@ VALUES ('1031', '1040', 'Test Results', 'Variability_Increase', 'redshift', NULL, 'SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), ('1032', '1041', 'Test Results', 'Variability_Decrease', 'redshift', NULL, 'SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1033', '1001', 'Profile Anomaly' , 'Suggested_Type', 'redshift', NULL, ''), + ('1033', '1001', 'Profile Anomaly' , 'Suggested_Type', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), ('1034', '1002', 'Profile Anomaly' , 'Non_Standard_Blanks', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), ('1035', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (len("{COLUMN_NAME}") >= 1 and len("{COLUMN_NAME}") <= 4) OR (len("{COLUMN_NAME}") > 10) OR (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', 
''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'), ('1036', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'redshift', NULL, 'SELECT DISTINCT column_name, table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = ''{TARGET_SCHEMA}'' AND column_name = ''{COLUMN_NAME}'' ORDER BY data_type, table_name;'), @@ -441,14 +460,14 @@ VALUES ('1056', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC;' ), ('1057', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'' AND "{COLUMN_NAME}" !~ ''\\s(and|but|or|yet)\\s'' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;' ), - ('1058', '1001', 'Profile Anomaly' , 'Suggested_Type', 'postgresql', NULL, ''), + ('1058', '1001', 'Profile Anomaly' , 'Suggested_Type', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), ('1059', '1002', 'Profile Anomaly' , 'Non_Standard_Blanks', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE 
CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), ('1050', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (LENGTH("{COLUMN_NAME}") >= 1 AND LENGTH("{COLUMN_NAME}") <= 4) OR (LENGTH("{COLUMN_NAME}") > 10) OR (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'', ''error'', ''missing'', ''tbd'', ''n/a'', ''#na'', ''none'', ''null'' , ''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'', ''(error)'', ''(missing)'', ''(tbd)'', ''(n/a)'', ''(#na)'', ''(none)'', ''(null)'', ''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'', ''[error]'', ''[missing]'', ''[tbd]'', ''[n/a]'', ''[#na]'', ''[none]'', ''[null]'' , ''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; '), ('1061', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'postgresql', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY data_type, table_name;'), ('1062', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'postgresql', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY data_type, table_name;'), ('1063', '1006', 'Profile Anomaly' , 'No_Values', 
'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1064', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC;' ), - ('1065', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'postgresql', NULL, 'SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY data_type;' ), + ('1065', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'postgresql', NULL, 'SELECT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY columns.table_name;' ), ('1066', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1067', '1010', 'Profile Anomaly' , 'Quoted_Values', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY 
"{COLUMN_NAME}";' ), ('1068', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;' ), @@ -501,31 +520,31 @@ VALUES ('1113', '1040', 'Test Results', 'Variability_Increase', 'postgresql', NULL, 'SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), ('1114', '1041', 'Test Results', 'Variability_Decrease', 'postgresql', NULL, 'SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1115', '1001', 'Profile Anomaly' , 'Suggested_Type', 'mssql', NULL, ''), - ('1116', '1002', 'Profile Anomaly' , 'Non_Standard_Blanks', 'mssql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'') OR "{COLUMN_NAME}" LIKE '' '' THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE ''%..%'' OR LOWER("{COLUMN_NAME}") LIKE ''%--%'' OR (LEN(REPLACE("{COLUMN_NAME}", ''0'', ''''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", ''9'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''x'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''z'', ''''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), + ('1115', '1001', 'Profile Anomaly' , 'Suggested_Type', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), + ('1116', '1002', 'Profile Anomaly' , 'Non_Standard_Blanks', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'') OR "{COLUMN_NAME}" LIKE '' '' THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE ''%..%'' OR LOWER("{COLUMN_NAME}") LIKE ''%--%'' OR (LEN(REPLACE("{COLUMN_NAME}", ''0'', ''''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", ''9'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''x'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''z'', ''''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN 
"{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), ('1117', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (LEN("{COLUMN_NAME}") >= 1 AND LEN("{COLUMN_NAME}") <= 4) OR (LEN("{COLUMN_NAME}") > 10) OR (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE ''%..%'' OR LOWER("{COLUMN_NAME}") LIKE ''%--%'' OR (LEN(REPLACE("{COLUMN_NAME}", ''0'', ''''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", ''9'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''x'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''z'', ''''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1118', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'mssql', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = ''datetime'' THEN ''datetime'' WHEN data_type = ''datetime2'' THEN ''datetime'' WHEN data_type = ''varchar'' THEN ''varchar('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''char'' THEN ''char('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''numeric'' THEN ''numeric('' + CAST(numeric_precision AS VARCHAR) + '','' + CAST(numeric_scale AS VARCHAR) + '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'), - ('1119', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'mssql', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = ''datetime'' THEN ''datetime'' WHEN data_type = ''datetime2'' THEN ''datetime'' WHEN data_type = ''varchar'' THEN ''varchar('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''char'' THEN ''char('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''numeric'' THEN ''numeric('' + CAST(numeric_precision AS VARCHAR) + '','' + CAST(numeric_scale AS VARCHAR) + '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'), - ('1120', '1006', 'Profile Anomaly' , 'No_Values', 'mssql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), + ('1118', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'mssql', NULL, 'SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = ''datetime'' THEN ''datetime'' WHEN data_type = ''datetime2'' THEN ''datetime'' WHEN data_type = 
''varchar'' THEN ''varchar('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''char'' THEN ''char('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''numeric'' THEN ''numeric('' + CAST(numeric_precision AS VARCHAR) + '','' + CAST(numeric_scale AS VARCHAR) + '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'), + ('1119', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'mssql', NULL, 'SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = ''datetime'' THEN ''datetime'' WHEN data_type = ''datetime2'' THEN ''datetime'' WHEN data_type = ''varchar'' THEN ''varchar('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''char'' THEN ''char('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''numeric'' THEN ''numeric('' + CAST(numeric_precision AS VARCHAR) + '','' + CAST(numeric_scale AS VARCHAR) + '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'), + ('1120', '1006', 'Profile Anomaly' , 'No_Values', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1121', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'mssql', NULL, 'WITH cte AS ( SELECT TRIM(value) AS top_pattern, ROW_NUMBER() OVER (ORDER BY CHARINDEX(''| ''+ TRIM(value) + '' |'', ''| '' + ''{DETAIL_EXPRESSION}'' + '' |'' ) ASC) as row_num FROM STRING_SPLIT(''{DETAIL_EXPRESSION}'', ''|'') ) SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 4 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 6 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 8 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 10 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', 
''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" ORDER BY top_pattern DESC, count DESC;' ), ('1122', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'mssql', NULL, 'SELECT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY table_name;' ), - ('1123', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'mssql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1124', '1010', 'Profile Anomaly' , 'Quoted_Values', 'mssql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" LIKE ''"%"'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), + ('1123', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), + ('1124', '1010', 'Profile Anomaly' , 'Quoted_Values', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" LIKE ''"%"'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1125', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), ('1126', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isdate("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isdate("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), - ('1127', '1013', 'Profile Anomaly' , 'Small Missing Value Ct', 'mssql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE ''%..%'' OR LOWER("{COLUMN_NAME}") LIKE ''%--%'' OR (LEN(REPLACE("{COLUMN_NAME}", ''0'', ''''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", ''9'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''x'', 
''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''z'', ''''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1128', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'mssql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), - ('1129', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'mssql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), - ('1130', '1016', 'Profile Anomaly' , 'Potential_Duplicates', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC;' ), + ('1127', '1013', 'Profile Anomaly' , 'Small Missing Value Ct', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE ''%..%'' OR LOWER("{COLUMN_NAME}") LIKE ''%--%'' OR (LEN(REPLACE("{COLUMN_NAME}", ''0'', ''''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", ''9'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''x'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''z'', ''''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), + ('1128', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), + ('1129', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), + ('1130', '1016', 'Profile Anomaly' , 'Potential_Duplicates', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC;' ), ('1131', '1017', 'Profile Anomaly' , 'Standardized_Value_Matches', 'mssql', NULL, 'WITH CTE AS ( SELECT DISTINCT TOP 500 UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",'' '''''''',.-'',REPLICATE('' '', LEN('' '''''''',.-''))),'' '','''')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") as distinct_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",'' '''''''',.-'',REPLICATE('' '', 
LEN('' '''''''',.-''))),'' '','''')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",'' '''''''',.-'',REPLICATE('' '', LEN('' '''''''',.-''))),'' '','''')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC;' ), - ('1132', '1018', 'Profile Anomaly' , 'Unlikely_Date_Values', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", CAST( ''{PROFILE_RUN_DATE}'' AS DATE) AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < CAST(''1900-01-01'' AS DATE) ) OR ("{COLUMN_NAME}" > DATEADD(YEAR, 30, CAST(''{PROFILE_RUN_DATE}'' AS DATE ))) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ), + ('1132', '1018', 'Profile Anomaly' , 'Unlikely_Date_Values', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", CAST( ''{PROFILE_RUN_DATE}'' AS DATE) AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE ("{COLUMN_NAME}" < CAST(''1900-01-01'' AS DATE) ) OR ("{COLUMN_NAME}" > DATEADD(YEAR, 30, CAST(''{PROFILE_RUN_DATE}'' AS DATE ))) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ), ('1133', '1019', 'Profile Anomaly' , 'Recency_One_Year', 'mssql', NULL, 'created_in_ui' ), ('1134', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'mssql', NULL, 'created_in_ui' ), - ('1135', '1021', 'Profile Anomaly' , 'Unexpected US States', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ), - ('1136', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ), + ('1135', '1021', 'Profile Anomaly' , 'Unexpected US States', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ), + ('1136', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ), ('1137', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), ('1138', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'mssql', NULL, 'WITH cte AS ( SELECT TRIM(value) AS top_pattern, ROW_NUMBER() OVER (ORDER BY CHARINDEX(''| ''+ TRIM(value) + '' |'', ''| '' + ''{DETAIL_EXPRESSION}'' + '' |'' ) ASC) as row_num FROM STRING_SPLIT(''{DETAIL_EXPRESSION}'', ''|'') ) SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 4 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 
''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 6 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 8 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 10 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" ORDER BY top_pattern DESC, count DESC;' ), - ('1139', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE ( "{COLUMN_NAME}" LIKE ''%,%,%,%'' OR "{COLUMN_NAME}" LIKE ''%|%|%|%'' OR "{COLUMN_NAME}" LIKE ''%^%^%^%'' OR "{COLUMN_NAME}" LIKE ''%'' + CHAR(9) + ''%'' + CHAR(9) + ''%'' + CHAR(9) + ''%'' ) AND NOT ( "{COLUMN_NAME}" LIKE ''% and %'' OR "{COLUMN_NAME}" LIKE ''% but %'' OR "{COLUMN_NAME}" LIKE ''% or %'' OR "{COLUMN_NAME}" LIKE ''% yet %'' ) AND ISNULL(CAST(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", '','', '''')) as FLOAT) / CAST(NULLIF(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", '' '', '''')), 0) as FLOAT), 1) > 0.6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), + ('1139', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE ( "{COLUMN_NAME}" LIKE ''%,%,%,%'' OR "{COLUMN_NAME}" LIKE ''%|%|%|%'' OR "{COLUMN_NAME}" LIKE ''%^%^%^%'' OR "{COLUMN_NAME}" LIKE ''%'' + CHAR(9) + ''%'' + CHAR(9) + ''%'' + CHAR(9) + ''%'' ) AND NOT ( "{COLUMN_NAME}" LIKE ''% and %'' OR "{COLUMN_NAME}" LIKE ''% but %'' OR "{COLUMN_NAME}" LIKE ''% or %'' OR "{COLUMN_NAME}" LIKE ''% yet %'' ) AND ISNULL(CAST(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", '','', '''')) as FLOAT) / CAST(NULLIF(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", '' '', '''')), 0) as FLOAT), 1) > 0.6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), ('1140', '1004', 'Test Results', 'Alpha_Trunc', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} ;'), ('1141', '1005', 'Test Results', 'Avg_Shift', 'mssql', NULL, 'SELECT AVG(CAST("{COLUMN_NAME}" AS FLOAT)) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};'), @@ -680,7 +699,7 
@@ ORDER BY check_period DESC;'), ('1170', '1040', 'Test Results', 'Variability_Increase', 'mssql', NULL, 'SELECT STDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), ('1171', '1041', 'Test Results', 'Variability_Decrease', 'mssql', NULL, 'SELECT STDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'), - ('1172', '1001', 'Profile Anomaly' , 'Suggested_Type', 'snowflake', NULL, ''), + ('1172', '1001', 'Profile Anomaly' , 'Suggested_Type', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), ('1173', '1002', 'Profile Anomaly' , 'Non_Standard_Blanks', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''-{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''0{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''9{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''x{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''z{2,}'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), ('1174', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (LEN("{COLUMN_NAME}") >= 1 AND LEN("{COLUMN_NAME}") <= 4) OR (LEN("{COLUMN_NAME}") > 10) OR (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''-{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''0{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''9{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''x{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''z{2,}'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'), ('1175', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'snowflake', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE ''timestamp%'' THEN lower(data_type) WHEN data_type ILIKE ''date'' THEN lower(data_type) WHEN data_type ILIKE ''boolean'' THEN ''boolean'' WHEN data_type = ''TEXT'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type ILIKE ''char%'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''NUMBER'' AND numeric_precision = 38 AND numeric_scale = 0 THEN ''bigint'' WHEN data_type ILIKE ''num%'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || 
'','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'),
@@ -742,7 +761,20 @@ ORDER BY check_period DESC;'),
     ('1229', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'redshift', NULL, 'WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING (''{DETAIL_EXPRESSION}'', STRPOS(''{DETAIL_EXPRESSION}'', '':'') + 2), ''|'') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'),
     ('1230', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE lower("{COLUMN_NAME}") IN (SELECT trim(value) FROM TABLE (FLATTEN(INPUT => SPLIT(SUBSTRING(''{DETAIL_EXPRESSION}'', POSITION('':'', ''{DETAIL_EXPRESSION}'') + 2), ''|''))) ) GROUP BY "{COLUMN_NAME}";'),
     ('1231', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'mssql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER("{COLUMN_NAME}") IN (SELECT trim(value) FROM STRING_SPLIT(SUBSTRING(''{DETAIL_EXPRESSION}'', CHARINDEX('':'', ''{DETAIL_EXPRESSION}'') + 2, 999), ''|'')) GROUP BY "{COLUMN_NAME}";'),
-    ('1232', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER("{COLUMN_NAME}") = ANY(STRING_TO_ARRAY(SUBSTRING(''{DETAIL_EXPRESSION}'', STRPOS(''{DETAIL_EXPRESSION}'', '':'') + 2), ''|'')) GROUP BY "{COLUMN_NAME}";');
+    ('1232', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER("{COLUMN_NAME}") = ANY(STRING_TO_ARRAY(SUBSTRING(''{DETAIL_EXPRESSION}'', STRPOS(''{DETAIL_EXPRESSION}'', '':'') + 2), ''|'')) GROUP BY "{COLUMN_NAME}";'),
+
+    ('1233', '1043', 'Test Results', 'Valid_Characters', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ ''[[:cntrl:]]'' OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR "{COLUMN_NAME}" LIKE ''"%"'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'),
+    ('1234', '1043', 'Test Results', 'Valid_Characters', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ ''[[:cntrl:]]'' OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR "{COLUMN_NAME}" LIKE ''"%"'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'),
+    ('1235', '1043', 'Test Results', 'Valid_Characters', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE PATINDEX(''%['' + CHAR(1) + ''-'' + CHAR(8) + CHAR(11) + CHAR(12) + CHAR(14) + ''-'' + CHAR(31) + '']%'', "{COLUMN_NAME}") > 0 OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR "{COLUMN_NAME}" LIKE ''"%"'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'),
+    ('1236', '1043', 'Test Results', 'Valid_Characters', 'snowflake', NULL, 'SELECT TOP 
20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}", ''.*[[:cntrl:]].*'') OR "{COLUMN_NAME}"::VARCHAR LIKE '' %'' OR "{COLUMN_NAME}"::VARCHAR LIKE ''''''%'''''' OR "{COLUMN_NAME}"::VARCHAR LIKE ''"%"'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), + ('1237', '1044', 'Test Results', 'Valid_US_Zip', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" NOT SIMILAR TO ''([0-9]{5} |[0-9]{5}-[0-9]{4}|[0-9]{9})'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), + ('1238', '1044', 'Test Results', 'Valid_US_Zip', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" NOT SIMILAR TO ''([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), + ('1239', '1044', 'Test Results', 'Valid_US_Zip', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" LIKE ''[0-9][0-9][0-9][0-9][0-9]'' OR "{COLUMN_NAME}" LIKE ''[0-9][0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'' OR "{COLUMN_NAME}" LIKE ''[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), + ('1240', '1044', 'Test Results', 'Valid_US_Zip', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, ''^([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})$'') = False GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), + ('1241', '1045', 'Test Results', 'Valid_US_Zip3', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LENGTH("{COLUMN_NAME}") <> 3 OR TRANSLATE("{COLUMN_NAME}",''0123456789'','''') <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), + ('1242', '1045', 'Test Results', 'Valid_US_Zip3', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LENGTH("{COLUMN_NAME}") <> 3 OR TRANSLATE("{COLUMN_NAME}",''0123456789'','''') <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), + ('1243', '1045', 'Test Results', 'Valid_US_Zip3', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LENGTH("{COLUMN_NAME}") <> 3 OR TRANSLATE("{COLUMN_NAME}",''0123456789'','''') <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), + ('1244', '1045', 'Test Results', 'Valid_US_Zip3', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LENGTH("{COLUMN_NAME}") <> 3 OR TRANSLATE("{COLUMN_NAME}",''0123456789'','''') <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'); TRUNCATE TABLE variant_codings; diff --git a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml index bc2b0fa..ac0a170 100644 --- a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml +++ b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml @@ -36,8 +36,8 @@ strTemplate05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE("{COL_NAME}",' '''',.-', WHEN "{COL_NAME}" BETWEEN ' !' AND '!' 
THEN 1 ELSE 0 END ) AS lead_space_ct, - SUM( CASE WHEN "{COL_NAME}" LIKE '"%"' OR "{COL_NAME}" LIKE '''%''' THEN 1 END ) as quoted_value_ct, - SUM( CASE WHEN "{COL_NAME}" LIKE '%[0-9]%' THEN 1 END ) as includes_digit_ct, + SUM( CASE WHEN "{COL_NAME}" LIKE '"%"' OR "{COL_NAME}" LIKE '''%''' THEN 1 ELSE 0 END ) as quoted_value_ct, + SUM( CASE WHEN "{COL_NAME}" LIKE '%[0-9]%' THEN 1 ELSE 0 END ) as includes_digit_ct, SUM( CASE WHEN "{COL_NAME}" IN ('.', '?') OR "{COL_NAME}" LIKE ' ' THEN 1 WHEN LEN("{COL_NAME}") > 1 diff --git a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml index bc138a6..edad7c7 100644 --- a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml +++ b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml @@ -36,8 +36,8 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a WHEN "{COL_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END ) AS lead_space_ct, - SUM( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 END ) as quoted_value_ct, - SUM( CASE WHEN "{COL_NAME}" ~ '[0-9]' THEN 1 END ) as includes_digit_ct, + SUM( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END ) as quoted_value_ct, + SUM( CASE WHEN "{COL_NAME}" ~ '[0-9]' THEN 1 ELSE 0 END ) as includes_digit_ct, SUM( CASE WHEN "{COL_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COL_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 diff --git a/testgen/template/flavors/postgresql/setup_profiling_tools/create_functions_postgresql.sql b/testgen/template/flavors/postgresql/setup_profiling_tools/create_functions_postgresql.sql index 380aacc..c78c430 100644 --- a/testgen/template/flavors/postgresql/setup_profiling_tools/create_functions_postgresql.sql +++ b/testgen/template/flavors/postgresql/setup_profiling_tools/create_functions_postgresql.sql @@ -65,9 +65,7 @@ OR REPLACE FUNCTION {DATA_QC_SCHEMA}.fndk_isnum(VARCHAR) AS $$ SELECT CASE - WHEN $1 ~ '^[0-9]+(\.[0-9]+)?$' THEN 1 - WHEN $1 ~ '\\$[0-9]+(\.[0-9]+)?$' THEN 1 - WHEN $1 ~ '^[0-9]+(\.[0-9]+)?\\$' THEN 1 + WHEN $1 ~ E'^\\s*[+-]?\\$?\\s*[0-9]+(,[0-9]{3})*(\\.[0-9]*)?[\\%]?\\s*$' THEN 1 ELSE 0 END; $$ diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml index 1b07346..d7f0fee 100644 --- a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml +++ b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml @@ -36,8 +36,8 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a WHEN "{COL_NAME}" BETWEEN ' !' AND '!' 
THEN 1 ELSE 0 END ) AS lead_space_ct, - SUM( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 END ) as quoted_value_ct, - SUM( CASE WHEN "{COL_NAME}" ~ '[0-9]' THEN 1 END ) as includes_digit_ct, + SUM( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END ) as quoted_value_ct, + SUM( CASE WHEN "{COL_NAME}" ~ '[0-9]' THEN 1 ELSE 0 END ) as includes_digit_ct, SUM( CASE WHEN "{COL_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COL_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 diff --git a/testgen/template/flavors/redshift/setup_profiling_tools/create_functions_redshift.sql b/testgen/template/flavors/redshift/setup_profiling_tools/create_functions_redshift.sql index 57d2d97..0270a38 100644 --- a/testgen/template/flavors/redshift/setup_profiling_tools/create_functions_redshift.sql +++ b/testgen/template/flavors/redshift/setup_profiling_tools/create_functions_redshift.sql @@ -4,9 +4,7 @@ CREATE OR REPLACE FUNCTION {DATA_QC_SCHEMA}.fndk_isnum(VARCHAR) AS $$ SELECT CASE - WHEN $1 ~ '^[0-9]+(\.[0-9]+)?$' THEN 1 - WHEN $1 ~ '\\$[0-9]+(\.[0-9]+)?$' THEN 1 - WHEN $1 ~ '^[0-9]+(\.[0-9]+)?\\$' THEN 1 + WHEN $1 ~ '^\\s*[+-]?\\$?\\s*[0-9]+(,[0-9]{3})*(\\.[0-9]*)?[\\%]?\\s*$' THEN 1 ELSE 0 END; $$ diff --git a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml index 82ea58b..ca41312 100644 --- a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml +++ b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml @@ -36,8 +36,8 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a WHEN "{COL_NAME}" BETWEEN ' !' AND '!' 
THEN 1 ELSE 0 END ) AS lead_space_ct, - SUM( CASE WHEN "{COL_NAME}"::VARCHAR ILIKE '"%"' OR "{COL_NAME}"::VARCHAR ILIKE '''%''' THEN 1 END ) as quoted_value_ct, - SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '.*[0-9].*') THEN 1 END ) as includes_digit_ct, + SUM( CASE WHEN "{COL_NAME}"::VARCHAR ILIKE '"%"' OR "{COL_NAME}"::VARCHAR ILIKE '''%''' THEN 1 ELSE 0 END ) as quoted_value_ct, + SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '.*[0-9].*') THEN 1 ELSE 0 END ) as includes_digit_ct, SUM( CASE WHEN "{COL_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COL_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COL_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COL_NAME}"::VARCHAR) REGEXP '9{2,}' diff --git a/testgen/template/flavors/snowflake/setup_profiling_tools/create_functions_snowflake.sql b/testgen/template/flavors/snowflake/setup_profiling_tools/create_functions_snowflake.sql index a57a8a1..ae26031 100644 --- a/testgen/template/flavors/snowflake/setup_profiling_tools/create_functions_snowflake.sql +++ b/testgen/template/flavors/snowflake/setup_profiling_tools/create_functions_snowflake.sql @@ -5,9 +5,7 @@ IMMUTABLE AS $$ SELECT CASE - WHEN REGEXP_COUNT(strparm::VARCHAR,'^[0-9]+(\.[0-9]+)?$') > 0 THEN 1 - WHEN REGEXP_COUNT(strparm::VARCHAR,'\\$[0-9]+(\.[0-9]+)?$') > 0 THEN 1 - WHEN REGEXP_COUNT(strparm::VARCHAR,'^[0-9]+(\.[0-9]+)?\\$') > 0 THEN 1 + WHEN REGEXP_LIKE(strparm::VARCHAR, '^\\s*[+-]?\\$?\\s*[0-9]+(,[0-9]{3})*(\\.[0-9]*)?[\\%]?\\s*$') THEN 1 ELSE 0 END $$; diff --git a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml index 8586652..7967003 100644 --- a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml +++ b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml @@ -36,8 +36,8 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a WHEN "{COL_NAME}" BETWEEN ' !' AND '!' 
THEN 1 ELSE 0 END ) AS lead_space_ct, - SUM( CASE WHEN "{COL_NAME}" LIKE '"%"' OR "{COL_NAME}" LIKE '''%''' THEN 1 END ) as quoted_value_ct, - SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}" , '[0-9]') = TRUE THEN 1 END ) as includes_digit_ct, + SUM( CASE WHEN "{COL_NAME}" LIKE '"%"' OR "{COL_NAME}" LIKE '''%''' THEN 1 ELSE 0 END ) as quoted_value_ct, + SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}" , '[0-9]') = TRUE THEN 1 ELSE 0 END ) as includes_digit_ct, SUM( CASE WHEN "{COL_NAME}" IN ('.', '?') THEN 1 WHEN REGEXP_LIKE(LOWER("{COL_NAME}") ,'(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)') = TRUE THEN 1 diff --git a/testgen/template/profiling/functional_datatype.sql b/testgen/template/profiling/functional_datatype.sql index 0906f33..08adb66 100644 --- a/testgen/template/profiling/functional_datatype.sql +++ b/testgen/template/profiling/functional_datatype.sql @@ -124,23 +124,34 @@ SET functional_data_type = 'Period Year' WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL AND (column_name ILIKE '%year%' OR column_name ILIKE '%yr%') - AND ( (min_length = 4 AND max_length = 4 AND min_text >= '1900' AND max_text <= '2200') - OR (min_value >= 1900 AND max_value <= 2200 AND COALESCE(SIGN(fractional_sum), 0) = 0) ); + AND ( (min_value >= 1900 + AND max_value <= DATE_PART('YEAR', NOW()) + 20 + AND COALESCE(fractional_sum, 0) = 0) + OR + (min_text >= '1900' + AND max_text <= (DATE_PART('YEAR', NOW()) + 20)::VARCHAR + AND avg_length = 4 + AND avg_embedded_spaces = 0) + ); UPDATE profile_results SET functional_data_type = 'Period Quarter' WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL - AND avg_length BETWEEN 6 and 7 - AND (column_name ILIKE '%qtr%' or column_name ILIKE '%quarter%') - AND min_text >= '1900' AND max_text <= '2200' - AND SPLIT_PART(top_patterns, '|', 2) ~ '^\s*NNNN[-_]AN\s*$'; + AND (column_name ILIKE '%qtr%' or column_name ILIKE '%quarter%') + AND ( (min_value = 1 AND max_value = 4 + AND COALESCE(fractional_sum, 0) = 0) + OR + (min_text >= '1900' AND max_text <= '2200' + AND avg_length BETWEEN 6 and 7 + AND SPLIT_PART(top_patterns, '|', 2) ~ '^\s*NNNN[-_]AN\s*$') + ); UPDATE profile_results SET functional_data_type = 'Period Month' WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL - AND column_name ILIKE '%mon%' + AND column_name ILIKE '%mo%' AND ( (max_length = 2 AND (min_text = '01' OR min_text = '1') AND max_text = '12') OR (min_value = 1 AND max_value = 12 AND COALESCE(SIGN(fractional_sum), 0) = 0) @@ -214,13 +225,13 @@ UPDATE profile_results SET functional_data_type = 'Person Name' WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL - AND column_name ~ '^(approver|first|last|full|contact|emp|employee|manager|mgr_|middle|nick|person|preferred|rep|reviewer|salesperson|spouse)(_| |)name$'; + AND column_name ~ '^(approver|first|last|full|contact|emp|employee|hcp|manager|mgr_|middle|nick|person|preferred|rep|reviewer|salesperson|spouse)(_| |)name$'; UPDATE profile_results SET functional_data_type = 'Entity Name' WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL - AND column_name ~ '^(acct|account|affiliation|branch|business|co|comp|company|corp|corporate|cust|customer|distributor|employer|entity|firm|franchise||org|organization|supplier|vendor|hospital|practice|clinic)(_| |)name$'; + AND column_name ~ 
'^(|acct|account|affiliation|branch|business|co|comp|company|corp|corporate|cust|customer|distributor|employer|entity|firm|franchise|hco|org|organization|supplier|vendor|hospital|practice|clinic)(_| |)name$'; -- 4. Assign CODE, CATEGORY, ID, ATTRIBUTE & DESCRIPTION /* @@ -318,8 +329,18 @@ SET functional_data_type = WHEN (max_value - min_value + 1 = distinct_value_ct) AND (fractional_sum IS NULL OR fractional_sum > 0) THEN 'Sequence' WHEN general_type='N' - AND ( - column_type ILIKE '%int%' + AND column_name SIMILAR TO '%(no|num|number|nbr)' + AND (column_type ILIKE '%int%' + OR + (RTRIM(SPLIT_PART(column_type, ',', 2), ')') > '0' + AND fractional_sum = 0) -- 0 implies integer; null is float or non-numeric + ) THEN + CASE + WHEN ROUND(100.0 * value_ct::FLOAT/NULLIF(record_ct, 0)) > 70 THEN 'ID' + ELSE 'Attribute-Numeric' + END + WHEN general_type='N' + AND ( column_type ILIKE '%int%' OR (RTRIM(SPLIT_PART(column_type, ',', 2), ')') > '0' AND fractional_sum = 0) -- 0 implies integer; null is float or non-numeric @@ -339,14 +360,24 @@ WHERE profile_run_id = '{PROFILE_RUN_ID}' -- 7. Assign 'ID-Unique' functional data type to the columns that are identity columns UPDATE profile_results -SET functional_data_type = CASE - WHEN record_ct = distinct_value_ct AND column_type IN ('smallint', 'integer', 'bigint') - AND record_ct > 50 THEN 'ID-Unique' - ELSE functional_data_type END +SET functional_data_type = 'ID-Unique' WHERE profile_run_id = '{PROFILE_RUN_ID}' - AND (functional_data_type = 'ID' - OR column_type IN ('smallint', 'integer', 'bigint') ); + AND functional_data_type IN ('ID', 'ID-Secondary') + AND record_ct = distinct_value_ct + AND record_ct > 50; + +-- Update alpha ID's to ID-Secondary and ID-Grouping +UPDATE profile_results +SET functional_data_type = CASE + WHEN ROUND(100.0 * value_ct::FLOAT/NULLIF(record_ct, 0)) > 70 + AND ROUND(100.0 * distinct_value_ct::FLOAT/NULLIF(value_ct, 0)) >= 75 THEN 'ID-Secondary' + WHEN ROUND(100.0 * value_ct::FLOAT/NULLIF(record_ct, 0)) > 70 + AND ROUND(100.0 * distinct_value_ct::FLOAT/NULLIF(value_ct, 0)) < 75 THEN 'ID-Group' + ELSE functional_data_type + END + WHERE profile_run_id = '{PROFILE_RUN_ID}' + AND functional_data_type = 'ID'; -- 8. Assign 'ID-FK' functional data type to the columns that are foreign keys of the identity columns identified in the previous step @@ -362,6 +393,7 @@ WHERE profile_results.profile_run_id = '{PROFILE_RUN_ID}' and profile_results.table_name <> ui.table_name and profile_results.functional_data_type <> 'ID-Unique'; +-- Assign -- 9. 
Functional Data Type: 'Measurement Pct' diff --git a/testgen/ui/queries/profiling_queries.py b/testgen/ui/queries/profiling_queries.py new file mode 100644 index 0000000..06b149e --- /dev/null +++ b/testgen/ui/queries/profiling_queries.py @@ -0,0 +1,147 @@ +import streamlit as st + +import testgen.ui.services.database_service as db +import testgen.ui.services.query_service as dq + + +@st.cache_data(show_spinner=False) +def run_table_groups_lookup_query(str_project_code): + str_schema = st.session_state["dbschema"] + return dq.run_table_groups_lookup_query(str_schema, str_project_code) + + +@st.cache_data(show_spinner=False) +def get_latest_profile_run(str_table_group): + str_schema = st.session_state["dbschema"] + str_sql = f""" + WITH last_profile_run + AS (SELECT table_groups_id, MAX(profiling_starttime) as last_profile_run_date + FROM {str_schema}.profiling_runs + GROUP BY table_groups_id) + SELECT id as profile_run_id + FROM {str_schema}.profiling_runs r + INNER JOIN last_profile_run l + ON (r.table_groups_id = l.table_groups_id + AND r.profiling_starttime = l.last_profile_run_date) + WHERE r.table_groups_id = '{str_table_group}'; +""" + str_profile_run_id = db.retrieve_single_result(str_sql) + + return str_profile_run_id + + +@st.cache_data(show_spinner=False) +def get_db_profile_run_choices(str_table_groups_id): + str_schema = st.session_state["dbschema"] + # Define the query + str_sql = f""" + SELECT DISTINCT profiling_starttime as profile_run_date, id + FROM {str_schema}.profiling_runs pr + WHERE pr.table_groups_id = '{str_table_groups_id}' + ORDER BY profiling_starttime DESC; + """ + # Retrieve and return data as df + return db.retrieve_data(str_sql) + + +@st.cache_data(show_spinner=False) +def run_table_lookup_query(str_table_groups_id): + str_schema = st.session_state["dbschema"] + str_sql = f""" + SELECT DISTINCT table_name + FROM {str_schema}.profile_results + WHERE table_groups_id = '{str_table_groups_id}'::UUID + ORDER BY table_name + """ + return db.retrieve_data(str_sql) + + +@st.cache_data(show_spinner=False) +def run_column_lookup_query(str_table_groups_id, str_table_name): + str_schema = st.session_state["dbschema"] + return dq.run_column_lookup_query(str_schema, str_table_groups_id, str_table_name) + + +@st.cache_data(show_spinner=False) +def lookup_db_parentage_from_run(str_profile_run_id): + str_schema = st.session_state["dbschema"] + # Define the query + str_sql = f""" + SELECT profiling_starttime as profile_run_date, g.table_groups_name + FROM {str_schema}.profiling_runs pr + INNER JOIN {str_schema}.table_groups g + ON pr.table_groups_id = g.id + WHERE pr.id = '{str_profile_run_id}' + """ + df = db.retrieve_data(str_sql) + if not df.empty: + return df.at[0, "profile_run_date"], df.at[0, "table_groups_name"] + + +@st.cache_data(show_spinner="Retrieving Data") +def get_profiling_detail(str_profile_run_id, str_table_name, str_column_name): + str_schema = st.session_state["dbschema"] + str_sql = f""" + SELECT -- Identifiers + id::VARCHAR, dk_id, + p.project_code, connection_id, p.table_groups_id::VARCHAR, + p.profile_run_id::VARCHAR, + run_date, sample_ratio, + -- Column basics + p.schema_name, p.table_name, position, p.column_name, + p.column_type, general_type as general_type_abbr, + CASE general_type + WHEN 'A' THEN 'Alpha' + WHEN 'N' THEN 'Numeric' + WHEN 'D' THEN 'Date' + WHEN 'T' THEN 'Time' + WHEN 'B' THEN 'Boolean' + ELSE 'N/A' + END as general_type, + functional_table_type, functional_data_type, + datatype_suggestion, + CASE WHEN s.column_name IS NOT NULL THEN 
'Yes' END as anomalies, + -- Shared counts + record_ct, value_ct, distinct_value_ct, null_value_ct, + -- Shared except for B and X + min_length, max_length, avg_length, + -- Alpha counts + distinct_std_value_ct, + numeric_ct, date_ct, + filled_value_ct as dummy_value_ct, + CASE WHEN general_type = 'A' THEN COALESCE(zero_length_ct, 0) END as zero_length_ct, + CASE WHEN general_type = 'A' THEN COALESCE(lead_space_ct, 0) END as lead_space_ct, + CASE WHEN general_type = 'A' THEN COALESCE(quoted_value_ct, 0) END as quoted_value_ct, + CASE WHEN general_type = 'A' THEN COALESCE(includes_digit_ct, 0) END as includes_digit_ct, + CASE WHEN general_type = 'A' THEN COALESCE(embedded_space_ct, 0) END as embedded_space_ct, + avg_embedded_spaces, + min_text, max_text, + std_pattern_match, + top_patterns, + top_freq_values, distinct_value_hash, + distinct_pattern_ct, + -- A and N + zero_value_ct, + -- Numeric + min_value, min_value_over_0, max_value, + avg_value, stdev_value, percentile_25, percentile_50, percentile_75, + fractional_sum, + -- Dates + min_date, max_date, + before_1yr_date_ct, before_5yr_date_ct, within_1yr_date_ct, within_1mo_date_ct, future_date_ct, + date_days_present, date_weeks_present, date_months_present, + -- Boolean + boolean_true_ct + FROM {str_schema}.profile_results p + LEFT JOIN (SELECT DISTINCT profile_run_id, table_name, column_name + FROM {str_schema}.profile_anomaly_results) s + ON (p.profile_run_id = s.profile_run_id + AND p.table_name = s.table_name + AND p.column_name = s.column_name) + WHERE p.profile_run_id = '{str_profile_run_id}'::UUID + AND p.table_name ILIKE '{str_table_name}' + AND p.column_name ILIKE '{str_column_name}' + ORDER BY p.schema_name, p.table_name, position; + """ + + return db.retrieve_data(str_sql) diff --git a/testgen/ui/queries/test_suite_queries.py b/testgen/ui/queries/test_suite_queries.py index 7ea1c7b..b57e384 100644 --- a/testgen/ui/queries/test_suite_queries.py +++ b/testgen/ui/queries/test_suite_queries.py @@ -106,3 +106,35 @@ def get_test_suite_usage(schema: str, test_suite_names: list[str]) -> pd.DataFra select distinct test_suite from {schema}.test_runs where test_suite in ({",".join(test_suite_names_join)}) and status = 'Running' """ return db.retrieve_data(sql) + + +def get_test_suite_refresh_check(schema, test_suite_name): + sql = f""" + SELECT COUNT(*) as test_ct, + SUM(CASE WHEN lock_refresh = 'N' THEN 1 ELSE 0 END) as unlocked_test_ct, + SUM(CASE WHEN lock_refresh = 'N' AND last_manual_update IS NOT NULL THEN 1 ELSE 0 END) as unlocked_edits_ct + FROM {schema}.test_definitions + WHERE test_suite = '{test_suite_name}'; +""" + return db.retrieve_data_list(sql)[0] + + +def get_generation_sets(schema): + sql = f""" + SELECT DISTINCT generation_set + FROM {schema}.generation_sets + ORDER BY generation_set; +""" + return db.retrieve_data(sql) + + +def lock_edited_tests(schema, test_suite_name): + sql = f""" + UPDATE {schema}.test_definitions + SET lock_refresh = 'Y' + WHERE test_suite = '{test_suite_name}' + AND last_manual_update IS NOT NULL + AND lock_refresh = 'N'; +""" + db.execute_sql(sql) + return True diff --git a/testgen/ui/services/connection_service.py b/testgen/ui/services/connection_service.py index 0ada931..ae59a5b 100644 --- a/testgen/ui/services/connection_service.py +++ b/testgen/ui/services/connection_service.py @@ -1,9 +1,9 @@ import streamlit as st -from testgen.commands.run_profiling_bridge import InitializeProfilingSQL -from testgen.commands.run_setup_profiling_tools import run_setup_profiling_tools import 
testgen.ui.queries.connection_queries as connection_queries import testgen.ui.services.table_group_service as table_group_service +from testgen.commands.run_profiling_bridge import InitializeProfilingSQL +from testgen.commands.run_setup_profiling_tools import run_setup_profiling_tools from testgen.common.database.database_service import ( AssignConnectParms, RetrieveDBResultsToList, diff --git a/testgen/ui/services/form_service.py b/testgen/ui/services/form_service.py index 692f5b2..f3c9a61 100644 --- a/testgen/ui/services/form_service.py +++ b/testgen/ui/services/form_service.py @@ -167,7 +167,9 @@ def render_widget(self, boo_form_display_only=False): raise ValueError(f"Widget {self.widget} is not supported.") -def _generate_excel_export(df_data, lst_export_columns, str_title, lst_wrap_columns=None, lst_column_headers=None): +def _generate_excel_export( + df_data, lst_export_columns, str_title=None, str_caption=None, lst_wrap_columns=None, lst_column_headers=None +): if lst_export_columns: # Filter the DataFrame to keep only the columns in lst_export_columns df_to_export = df_data[lst_export_columns] @@ -179,6 +181,7 @@ def _generate_excel_export(df_data, lst_export_columns, str_title, lst_wrap_colu if not str_title: str_title = "TestGen Data Export" + start_row = 4 if str_caption else 3 # Create a BytesIO buffer to hold the Excel file output = BytesIO() @@ -186,7 +189,7 @@ def _generate_excel_export(df_data, lst_export_columns, str_title, lst_wrap_colu # Create a Pandas Excel writer using XlsxWriter as the engine with pd.ExcelWriter(output, engine="xlsxwriter") as writer: # Write the DataFrame to an Excel file, starting from the fourth row - df_to_export.to_excel(writer, index=False, sheet_name="Sheet1", startrow=3) + df_to_export.to_excel(writer, index=False, sheet_name="Sheet1", startrow=start_row) # Access the XlsxWriter workbook and worksheet objects from the dataframe workbook = writer.book @@ -199,7 +202,11 @@ def _generate_excel_export(df_data, lst_export_columns, str_title, lst_wrap_colu else: column_settings = [{"header": column} for column in df_to_export.columns] worksheet.add_table( - 3, 0, max_row + 3, max_col - 1, {"columns": column_settings, "style": "Table Style Medium 16"} + start_row, + 0, + max_row + start_row, + max_col - 1, + {"columns": column_settings, "style": "Table Style Medium 16"}, ) # Define the format for wrapped text @@ -227,6 +234,11 @@ def _generate_excel_export(df_data, lst_export_columns, str_title, lst_wrap_colu # Write the title in cell A2 with formatting worksheet.write("A2", str_title, title_format) + if str_caption: + str_caption = str_caption.replace("{TIMESTAMP}", datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + caption_format = workbook.add_format({"italic": True, "size": 9, "valign": "top"}) + worksheet.write("A3", str_caption, caption_format) + # Rewind the buffer output.seek(0) @@ -234,18 +246,29 @@ def _generate_excel_export(df_data, lst_export_columns, str_title, lst_wrap_colu return output.getvalue() -def render_excel_export(df, lst_export_columns, str_export_title, lst_wrap_columns=None, lst_column_headers=None): +def render_excel_export( + df, lst_export_columns, str_export_title=None, str_caption=None, lst_wrap_columns=None, lst_column_headers=None +): # Set up the download button st.download_button( label=":blue[**⤓**]", use_container_width=True, help="Download to Excel", - data=_generate_excel_export(df, lst_export_columns, str_export_title, lst_wrap_columns, lst_column_headers), + data=_generate_excel_export( + df, lst_export_columns, 
str_export_title, str_caption, lst_wrap_columns, lst_column_headers + ), file_name=f"{str_export_title}.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ) +def render_refresh_button(button_container): + with button_container: + do_refresh = st.button(":blue[**⟳**]", help="Refresh page data", use_container_width=False) + if do_refresh: + reset_post_updates("Refreshing page", True, True) + + def show_prompt(str_prompt=None): if str_prompt: st.markdown(f":blue[{str_prompt}]") @@ -345,7 +368,7 @@ def reset_post_updates(str_message=None, as_toast=False, clear_cache=True, lst_c st.toast(str_message) else: st.success(str_message) - sleep(1) + sleep(0.5) if clear_cache: if lst_cached_functions: @@ -356,9 +379,11 @@ def reset_post_updates(str_message=None, as_toast=False, clear_cache=True, lst_c st.experimental_rerun() -def render_page_header(str_page_title, str_help_link=None, str_description=None, lst_breadcrumbs=None): +def render_page_header( + str_page_title, str_help_link=None, str_description=None, lst_breadcrumbs=None, boo_show_refresh=False +): hcol1, hcol2 = st.columns([9, 1]) - hcol1.subheader(str_page_title) + hcol1.subheader(str_page_title, anchor=False) if str_help_link: with hcol2: st.caption(" ") @@ -377,7 +402,11 @@ def render_page_header(str_page_title, str_help_link=None, str_description=None, st.session_state["last_page"] = str_page_title if lst_breadcrumbs: - bcol1, bcol2 = st.columns([96, 4]) + if boo_show_refresh: + bcol1, bcol2, bcol3, _ = st.columns([875, 60, 60, 5]) + render_refresh_button(bcol3) + else: + bcol1, bcol2, _ = st.columns([95, 4, 1]) with bcol1: testgen.breadcrumbs(breadcrumbs=lst_breadcrumbs) return bcol2 diff --git a/testgen/ui/services/table_group_service.py b/testgen/ui/services/table_group_service.py index 3a1125e..598c163 100644 --- a/testgen/ui/services/table_group_service.py +++ b/testgen/ui/services/table_group_service.py @@ -1,9 +1,9 @@ import streamlit as st -from testgen.common.database.database_service import RetrieveDBResultsToDictList import testgen.ui.queries.table_group_queries as table_group_queries import testgen.ui.services.connection_service as connection_service import testgen.ui.services.test_suite_service as test_suite_service +from testgen.common.database.database_service import RetrieveDBResultsToDictList def get_by_id(table_group_id: str): diff --git a/testgen/ui/services/test_suite_service.py b/testgen/ui/services/test_suite_service.py index b7b44a6..45adc4e 100644 --- a/testgen/ui/services/test_suite_service.py +++ b/testgen/ui/services/test_suite_service.py @@ -42,3 +42,35 @@ def are_test_suites_in_use(test_suite_names): schema = st.session_state["dbschema"] usage_result = test_suite_queries.get_test_suite_usage(schema, test_suite_names) return not usage_result.empty + + +def get_test_suite_refresh_warning(test_suite_name): + if not test_suite_name: + return False + schema = st.session_state["dbschema"] + row_result = test_suite_queries.get_test_suite_refresh_check(schema, test_suite_name) + + test_ct = None + unlocked_test_ct = None + unlocked_edits_ct = None + if row_result: + test_ct = row_result["test_ct"] + unlocked_test_ct = row_result["unlocked_test_ct"] + unlocked_edits_ct = row_result["unlocked_edits_ct"] + + return test_ct, unlocked_test_ct, unlocked_edits_ct + + +def get_generation_set_choices(): + schema = st.session_state["dbschema"] + dfSets = test_suite_queries.get_generation_sets(schema) + if dfSets.empty: + return None + else: + return dfSets["generation_set"].to_list() + + +def 
lock_edited_tests(test_suite_name): + schema = st.session_state["dbschema"] + tests_locked = test_suite_queries.lock_edited_tests(schema, test_suite_name) + return tests_locked diff --git a/testgen/ui/views/profiling_anomalies.py b/testgen/ui/views/profiling_anomalies.py index d8a4e0c..5e13bc7 100644 --- a/testgen/ui/views/profiling_anomalies.py +++ b/testgen/ui/views/profiling_anomalies.py @@ -7,6 +7,9 @@ import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq import testgen.ui.services.toolbar_service as tb +import testgen.ui.queries.profiling_queries as profiling_queries +from testgen.ui.views.profiling_details import show_profiling_detail + from testgen.ui.components import widgets as testgen from testgen.ui.navigation.page import Page from testgen.ui.session import session @@ -109,7 +112,9 @@ def render(self) -> None: "suggested_action", ] lst_wrap_columns = ["anomaly_description", "suggested_action"] - fm.render_excel_export(df_pa, lst_export_columns, "Profiling Anomalies", lst_wrap_columns) + fm.render_excel_export( + df_pa, lst_export_columns, "Profiling Anomalies", "{TIMESTAMP}", lst_wrap_columns + ) if selected: # Always show details for last selected row @@ -139,7 +144,11 @@ def render(self) -> None: int_data_width=700, ) with col2: - _, v_col2 = st.columns([0.3, 0.7]) + # _, v_col2 = st.columns([0.3, 0.7]) + v_col1, v_col2 = st.columns([0.5, 0.5]) + view_profiling( + v_col1, selected_row["table_name"], selected_row["column_name"], str_profile_run_id + ) view_bad_data(v_col2, selected_row) # Need to render toolbar buttons after grid, so selection status is maintained @@ -195,26 +204,6 @@ def get_db_table_group_choices(str_project_code): return dq.run_table_groups_lookup_query(str_schema, str_project_code) -@st.cache_data(show_spinner=False) -def get_latest_profile_run(str_table_group): - str_schema = st.session_state["dbschema"] - str_sql = f""" - WITH last_profile_run - AS (SELECT table_groups_id, MAX(profiling_starttime) as last_profile_run_date - FROM profiling_runs - GROUP BY table_groups_id) - SELECT profile_run_id - FROM {str_schema}.profiling_runs r - INNER JOIN {str_schema}.last_profile_run l - ON (r.table_groups_id = l.table_groups_id - AND r.profiling_starttime = l.last_profile_run_date) - WHERE r.table_groups_id = '{str_table_group}'; -""" - str_profile_run_id = db.retrieve_single_result(str_sql) - - return str_profile_run_id - - @st.cache_data(show_spinner="Retrieving Data") def get_profiling_anomalies(str_profile_run_id, str_likelihood): str_schema = st.session_state["dbschema"] @@ -459,14 +448,12 @@ def view_bad_data(button_container, selected_row): with button_container: if st.button( - "Review Source Data →", help="Review source data for highlighted anomaly", use_container_width=True + ":green[Source Data →]", help="Review current source data for highlighted anomaly", use_container_width=True ): bad_data_modal.open() if bad_data_modal.is_open(): with bad_data_modal.container(): - # fm.show_subheader(str_header) - # fm.show_prompt(selected_row['anomaly_name']) fm.render_modal_header(selected_row["anomaly_name"], None) st.caption(selected_row["anomaly_description"]) fm.show_prompt(str_header) @@ -492,6 +479,23 @@ def view_bad_data(button_container, selected_row): st.dataframe(df_bad, height=500, width=1050, hide_index=True) +def view_profiling(button_container, str_table_name, str_column_name, str_profiling_run_id): + str_header = f"Column: {str_column_name}, Table: {str_table_name}" + + df = 
profiling_queries.get_profiling_detail(str_profiling_run_id, str_table_name, str_column_name) + + profiling_modal = testgen.Modal(title=None, key="dk-anomaly-profiling-modal", max_width=1100) + + with button_container: + if st.button(":green[Profiling →]", help="Review profiling for highlighted column", use_container_width=True): + profiling_modal.open() + + if profiling_modal.is_open(): + with profiling_modal.container(): + fm.render_modal_header(str_header, None) + show_profiling_detail(df.iloc[0], 300) + + def do_disposition_update(selected, str_new_status): str_result = None if selected: diff --git a/testgen/ui/views/profiling_details.py b/testgen/ui/views/profiling_details.py new file mode 100644 index 0000000..0fb8386 --- /dev/null +++ b/testgen/ui/views/profiling_details.py @@ -0,0 +1,335 @@ +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +import streamlit as st + +import testgen.ui.services.database_service as db +import testgen.ui.services.form_service as fm + + +@st.cache_data(show_spinner="Retrieving Details") +def get_profile_screen(str_profile_run_id, str_table_name, str_column_name): + str_schema = st.session_state["dbschema"] + # Define the query + str_sql = f""" + SELECT pr.column_name, t.anomaly_name, replace(pr.detail, ' | ', ' ') as detail + FROM {str_schema}.profile_anomaly_results pr + INNER JOIN {str_schema}.profile_anomaly_types t + ON (pr.anomaly_id = t.id) + WHERE pr.profile_run_id = '{str_profile_run_id}'::UUID + AND pr.table_name = '{str_table_name}' + AND pr.column_name = '{str_column_name}' + AND t.anomaly_name <> 'Suggested Data Type' + ORDER BY anomaly_name; + """ + # Retrieve and return data as df + return db.retrieve_data(str_sql) + + +def reverse_count_category_pairs(input_str): + # Split the string by ' | ' to get individual elements + elements = input_str.split(" | ") + # Initialize an empty list to store reversed pairs + reversed_pairs = [] + display_pairs = [] + + # Loop to populate the list with reversed pairs + for i in range(0, len(elements), 2): + count = elements[i] + category = elements[i + 1] + + # Reverse count and category + reversed_pair = f"{category} | {count}" + reversed_pairs.append(reversed_pair) + # Reverse second version, for display on separate lines + display_pair = f"{category}: {count}" + display_pairs.append(display_pair) + + # Join the reversed pairs back into a single string + reversed_str = " | ".join(reversed_pairs) + + # Join the reversed pairs back into a single string + display_str = "
".join(display_pairs) + + return reversed_str, display_str + + +def write_profile_screen(selected_row): + df_screen = get_profile_screen( + selected_row["profile_run_id"], selected_row["table_name"], selected_row["column_name"] + ) + if not df_screen.empty: + with st.expander("**Profiling Anomalies**"): + # fm.render_markdown_table(df_screen, ["column_name", "anomaly_name", "detail"]) + st.dataframe(df_screen, use_container_width=True, hide_index=True) + + +def write_column_header(selected_row, form_data_width): + str_header = "Profiling Results" + lst_columns = [ + "column_name", + "table_name", + "schema_name", + "general_type", + "column_type", + "functional_data_type", + "datatype_suggestion", + ] + fm.render_html_list(selected_row, lst_columns, str_header, form_data_width) + + +def write_shared_header(selected_row, form_data_width): + str_header = "Data Overview" + # lst_columns = "record_ct, value_ct, distinct_value_ct, min_length, max_length, avg_length".split(", ") + lst_columns = "record_ct, value_ct, distinct_value_ct".split(", ") + fm.render_html_list(selected_row, lst_columns, str_header, form_data_width) + + +def write_alpha_missing_values(selected_row, form_data_width): + str_header = "Missing Values" + lst_columns = "null_value_ct, zero_length_ct, dummy_value_ct, zero_value_ct".split(", ") + fm.render_html_list(selected_row, lst_columns, str_header, form_data_width) + + +def write_numeric_missing_values(selected_row, form_data_width): + str_header = "Missing Values" + lst_columns = "null_value_ct, zero_value_ct".split(", ") + fm.render_html_list(selected_row, lst_columns, str_header, form_data_width) + + +def write_alpha_content_analysis(selected_row, form_data_width): + str_header = "Content Analysis" + lst_columns = "numeric_ct, date_ct, includes_digit_ct, embedded_space_ct, avg_embedded_spaces".split(", ") + fm.render_html_list(selected_row, lst_columns, str_header, form_data_width) + + +def write_alpha_value_analysis(selected_row, form_data_width): + str_header = "Value Analysis" + lst_columns = "min_length, max_length, avg_length, min_text, max_text, top_freq_values, distinct_pattern_ct, top_patterns, std_pattern_match".split( + ", " + ) + if selected_row["top_patterns"]: + # Need to reverse this, as it's saved | NNNN | Category | NNN | Category + str_top_patterns, str_top_patterns_display = reverse_count_category_pairs(selected_row["top_patterns"]) + selected_row["top_patterns"] = str_top_patterns_display + + fm.render_html_list(selected_row, lst_columns, str_header, form_data_width) + # Now reset for graph + if selected_row["top_patterns"]: + selected_row["top_patterns"] = str_top_patterns + + +def write_numeric_value_analysis(selected_row, form_data_width): + str_header = "Values and Ranges" + lst_columns = "min_value, min_value_over_0, max_value, min_length, max_length, avg_length".split(", ") + fm.render_html_list(selected_row, lst_columns, str_header, form_data_width) + + +def write_stats_value_analysis(selected_row, form_data_width): + str_header = "Descriptive Statistics" + lst_columns = "avg_value, stdev_value, percentile_25, percentile_50, percentile_75".split(", ") + fm.render_html_list(selected_row, lst_columns, str_header, form_data_width) + + +def write_date_analysis(selected_row, form_data_width): + str_header = "Date Value Analysis" + lst_columns = "min_date, max_date, before_1yr_date_ct, before_5yr_date_ct, within_1yr_date_ct, within_1mo_date_ct, future_date_ct".split( + ", " + ) + fm.render_html_list(selected_row, lst_columns, str_header, 
form_data_width) + + +def write_boolean_analysis(selected_row, form_data_width): + str_header = "Boolean Value Analysis" + lst_columns = "boolean_true_ct".split(", ") + fm.render_html_list(selected_row, lst_columns, str_header, form_data_width) + + +def write_missing_values_graph(value_ct, null_value_ct, zero_length_ct, dummy_values_ct): + lst_status = ["Value Present", "Null Value"] + lst_ct = [value_ct, null_value_ct] + + if zero_length_ct: + lst_status.append("Zero-Length") + lst_ct.append(zero_length_ct) + if dummy_values_ct: + lst_status.append("Dummy Value") + lst_ct.append(dummy_values_ct) + + dfg = pd.DataFrame({"Status": lst_status, "Count": lst_ct}) + + # fig = px.bar(dfg, x='Count', y='Status', orientation='h', title='Missing Values') + fig = px.pie(dfg, values="Count", names="Status", title="Missing Values") + # Show percentage in the pie chart + fig.update_traces(textinfo="percent+label") + fig.update_layout( + width=400, + title_font={"color": "green"}, + paper_bgcolor="rgba(0,0,0,0)", + plot_bgcolor="rgba(0,0,0,0)", + ) + + # Create the stacked bar chart + st.plotly_chart(fig, use_container_width=True) + + +def write_top_freq_graph(input_str): + lines = input_str.strip().split("\n") + + # Initialize empty lists to store categories and frequencies + categories = [] + frequencies = [] + + # Loop through each line to extract category and frequency + for line in lines: + parts = line.split(" | ") + # Remove the leading pipe character from the category + category = parts[0].replace("| ", "").strip() + frequency = int(parts[1]) + + categories.append(category) + frequencies.append(frequency) + + # Create a Pandas DataFrame + dff = pd.DataFrame({"Value": categories, "Frequency": frequencies}) + + # Calculate the total count and percentages + total_count = dff["Frequency"].sum() + dff["pct"] = (dff["Frequency"] / total_count * 100).round(2) + + # Create the Plotly Express histogram + fig = px.bar(dff, x="Value", y="Frequency", title="Value Frequency", text=dff["pct"].apply(lambda x: f"{x}%")) + # Update the trace to position text labels + fig.update_traces(textposition="outside") + fig.update_xaxes(type="category") + fig.update_layout( + width=400, + height=500, + title_font={"color": "green"}, + paper_bgcolor="rgba(0,0,0,0)", + plot_bgcolor="rgba(0,0,0,0)", + ) + + st.plotly_chart(fig) + + +def write_top_patterns_graph(input_str): + # Split the string by ' | ' to get individual elements + elements = input_str.split(" | ") + + # Initialize empty lists to store categories and frequencies + categories = [] + frequencies = [] + + # Loop to populate the lists with data + for i in range(0, len(elements), 2): + categories.append(elements[i]) + frequencies.append(int(elements[i + 1])) # Convert string to integer for count + + # Create a DataFrame using the populated lists + dff = pd.DataFrame({"Category": categories, "Frequency": frequencies}) + + # Create the Plotly Express histogram + fig = px.bar(dff, x="Category", y="Frequency", title="Top Patterns") + fig.update_layout( + width=400, + title_font={"color": "green"}, + paper_bgcolor="rgba(0,0,0,0)", + plot_bgcolor="rgba(0,0,0,0)", + ) + + st.plotly_chart(fig) + + +def write_box_plot(min_value, max_value, avg_value, stdev_value, percentile_25, percentile_75): + # Pick right IQR values + iqr_25 = percentile_25 if percentile_25 else avg_value - stdev_value + iqr_75 = percentile_75 if percentile_75 else avg_value + stdev_value + + # Create a DataFrame for the box plot + df = pd.DataFrame( + { + # "Value": [min_value, avg_value - stdev_value, 
avg_value, avg_value + stdev_value, max_value], + "Value": [min_value, iqr_25, avg_value, iqr_75, max_value], + "Category": ["Data Distribution"] * 5, + } + ) + + # Create a box plot + fig = px.box(df, y="Value", title="Summary Stats", labels={"Value": "Value"}) + + # Add Dot plot for min, max, and average + # fig.add_scatter( + # y=[min_value, avg_value, max_value], + # mode="markers", + # marker={"size": [10, 15, 10], "color": ["blue", "green", "red"]}, + # name="Min, Avg, Max", + # ) + + # Add line for standard deviation + fig.add_shape( + go.layout.Shape( + type="line", + x0=0.5, + x1=0.5, + y0=avg_value - stdev_value, + y1=avg_value + stdev_value, + line={ + "color": "Purple", + "width": 4, + "dash": "dot", + }, + ) + ) + + fig.update_layout( + width=400, + title_font={"color": "green"}, + paper_bgcolor="rgba(0,0,0,0)", + plot_bgcolor="rgba(0,0,0,0)", + ) + st.plotly_chart(fig) + + +def show_profiling_detail(selected_row, form_data_width=400): + write_profile_screen(selected_row) + + layout_column_1, layout_column_2 = st.columns([0.5, 0.5]) + + with layout_column_1: + write_column_header(selected_row, form_data_width) + write_shared_header(selected_row, form_data_width) + if selected_row["general_type_abbr"] == "A": + write_alpha_missing_values(selected_row, form_data_width) + write_alpha_content_analysis(selected_row, form_data_width) + write_alpha_value_analysis(selected_row, form_data_width) + elif selected_row["general_type_abbr"] == "N": + write_numeric_missing_values(selected_row, form_data_width) + write_numeric_value_analysis(selected_row, form_data_width) + write_stats_value_analysis(selected_row, form_data_width) + elif selected_row["general_type_abbr"] == "D": + write_date_analysis(selected_row, form_data_width) + # elif selected_row['general_type_abbr'] == "T": + elif selected_row["general_type_abbr"] == "B": + write_boolean_analysis(selected_row, form_data_width) + + with layout_column_2: + if selected_row["avg_value"] is not None: + write_box_plot( + selected_row["min_value"], + selected_row["max_value"], + selected_row["avg_value"], + selected_row["stdev_value"], + selected_row["percentile_25"], + selected_row["percentile_75"], + ) + if selected_row["top_freq_values"] is not None: + write_top_freq_graph(selected_row["top_freq_values"]) + if selected_row["top_patterns"] is not None: + write_top_patterns_graph(selected_row["top_patterns"]) + write_missing_values_graph( + selected_row["value_ct"], + selected_row["null_value_ct"], + selected_row["zero_length_ct"], + selected_row["dummy_value_ct"], + ) diff --git a/testgen/ui/views/profiling_results.py b/testgen/ui/views/profiling_results.py index 3d57c93..4768e05 100644 --- a/testgen/ui/views/profiling_results.py +++ b/testgen/ui/views/profiling_results.py @@ -5,6 +5,7 @@ import plotly.graph_objects as go import streamlit as st +import testgen.ui.queries.profiling_queries as profiling_queries import testgen.ui.services.database_service as db import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq @@ -12,6 +13,8 @@ from testgen.common import date_service from testgen.ui.navigation.page import Page from testgen.ui.session import session +from testgen.ui.views.profiling_details import show_profiling_detail + FORM_DATA_WIDTH = 400 @@ -23,7 +26,7 @@ class ProfilingResultsPage(Page): ] def render(self) -> None: - fm.render_page_header( + export_container = fm.render_page_header( "Data Profiling Results", "https://docs.datakitchen.io/article/dataops-testgen-help/investigate-profiling", 
lst_breadcrumbs=[ @@ -51,7 +54,9 @@ def render(self) -> None: # Retrieve Choices data if str_profile_run_id: # Lookup profiling run date and table group name from passed profile run - str_lookfor_run_date, str_lookfor_table_group = lookup_db_parentage_from_run(str_profile_run_id) + str_lookfor_run_date, str_lookfor_table_group = profiling_queries.lookup_db_parentage_from_run( + str_profile_run_id + ) str_lookfor_run_date = date_service.get_timezoned_timestamp(st.session_state, str_lookfor_run_date) else: str_lookfor_run_date = "" @@ -59,14 +64,14 @@ def render(self) -> None: with tool_bar.long_slots[0]: # Prompt for Table Group (with passed default) - df = run_table_groups_lookup_query(str_project) + df = profiling_queries.run_table_groups_lookup_query(str_project) str_table_groups_id = fm.render_select( "Table Group", df, "table_groups_name", "id", True, str_lookfor_table_group, True ) with tool_bar.long_slots[1]: # Prompt for Profile Run (with passed default) - df = get_db_profile_run_choices(str_table_groups_id) + df = profiling_queries.get_db_profile_run_choices(str_table_groups_id) date_service.create_timezoned_column_in_dataframe( st.session_state, df, "profile_run_date_with_timezone", "profile_run_date" ) @@ -79,13 +84,13 @@ def render(self) -> None: with tool_bar.long_slots[2]: # Prompt for Table Name - df = run_table_lookup_query(str_table_groups_id) + df = profiling_queries.run_table_lookup_query(str_table_groups_id) str_table_name = fm.render_select("Table Name", df, "table_name", "table_name", False) with tool_bar.long_slots[3]: # Prompt for Column Name if str_table_name: - df = run_column_lookup_query(str_table_groups_id, str_table_name) + df = profiling_queries.run_column_lookup_query(str_table_groups_id, str_table_name) str_column_name = fm.render_select("Column Name", df, "column_name", "column_name", False) if not str_column_name: # Use SQL wildcard to match all values @@ -97,475 +102,94 @@ def render(self) -> None: # Display main results grid if str_profile_run_id: - df, show_columns = get_main_dataset(str_profile_run_id, str_table_name, str_column_name) + df = profiling_queries.get_profiling_detail(str_profile_run_id, str_table_name, str_column_name) + show_columns = [ + "schema_name", + "table_name", + "column_name", + "column_type", + "functional_data_type", + "anomalies", + ] # Show CREATE script button if len(df) > 0 and str_table_name != "%%": - # if tool_bar.button_slots[0].button("📜", help="Show table CREATE script with suggested datatypes"): with st.expander("📜 **Table CREATE script with suggested datatypes**"): st.code(generate_create_script(df), "sql") selected_row = fm.render_grid_select(df, show_columns) + with export_container: + lst_export_columns = [ + "schema_name", + "table_name", + "column_name", + "position", + "column_type", + "general_type", + "functional_table_type", + "functional_data_type", + "datatype_suggestion", + "anomalies", + "record_ct", + "value_ct", + "distinct_value_ct", + "top_freq_values", + "null_value_ct", + "min_length", + "max_length", + "avg_length", + "distinct_std_value_ct", + "numeric_ct", + "date_ct", + "dummy_value_ct", + "zero_length_ct", + "lead_space_ct", + "quoted_value_ct", + "includes_digit_ct", + "embedded_space_ct", + "avg_embedded_spaces", + "min_text", + "max_text", + "std_pattern_match", + "distinct_pattern_ct", + "top_patterns", + "distinct_value_hash", + "min_value", + "min_value_over_0", + "max_value", + "avg_value", + "stdev_value", + "percentile_25", + "percentile_50", + "percentile_75", + "zero_value_ct", + 
"fractional_sum", + "min_date", + "max_date", + "before_1yr_date_ct", + "before_5yr_date_ct", + "within_1yr_date_ct", + "within_1mo_date_ct", + "future_date_ct", + "date_days_present", + "date_weeks_present", + "date_months_present", + "boolean_true_ct", + ] + lst_wrap_columns = ["top_freq_values", "top_patterns"] + str_caption = "{TIMESTAMP}" + fm.render_excel_export(df, lst_export_columns, "Profiling Results", str_caption, lst_wrap_columns) + # Display profiling for selected row if not selected_row: st.markdown(":orange[Select a row to see profiling details.]") else: - show_record_detail(selected_row[0]) + show_profiling_detail(selected_row[0], FORM_DATA_WIDTH) else: st.markdown(":orange[Select a profiling run.]") -@st.cache_data(show_spinner=False) -def run_table_groups_lookup_query(str_project_code): - str_schema = st.session_state["dbschema"] - return dq.run_table_groups_lookup_query(str_schema, str_project_code) - - -@st.cache_data(show_spinner=False) -def get_db_profile_run_choices(str_table_groups_id): - str_schema = st.session_state["dbschema"] - # Define the query - str_sql = f""" - SELECT DISTINCT profiling_starttime as profile_run_date, id - FROM {str_schema}.profiling_runs pr - WHERE pr.table_groups_id = '{str_table_groups_id}' - ORDER BY profiling_starttime DESC; - """ - # Retrieve and return data as df - return db.retrieve_data(str_sql) - - -@st.cache_data(show_spinner=False) -def run_table_lookup_query(str_table_groups_id): - str_schema = st.session_state["dbschema"] - str_sql = f""" - SELECT DISTINCT table_name - FROM {str_schema}.profile_results - WHERE table_groups_id = '{str_table_groups_id}'::UUID - ORDER BY table_name - """ - return db.retrieve_data(str_sql) - - -@st.cache_data(show_spinner=False) -def run_column_lookup_query(str_table_groups_id, str_table_name): - str_schema = st.session_state["dbschema"] - return dq.run_column_lookup_query(str_schema, str_table_groups_id, str_table_name) - - -@st.cache_data(show_spinner=False) -def lookup_db_parentage_from_run(str_profile_run_id): - str_schema = st.session_state["dbschema"] - # Define the query - str_sql = f""" - SELECT profiling_starttime as profile_run_date, g.table_groups_name - FROM {str_schema}.profiling_runs pr - INNER JOIN {str_schema}.table_groups g - ON pr.table_groups_id = g.id - WHERE pr.id = '{str_profile_run_id}' - """ - df = db.retrieve_data(str_sql) - if not df.empty: - return df.at[0, "profile_run_date"], df.at[0, "table_groups_name"] - - -@st.cache_data(show_spinner="Retrieving Data") -def get_main_dataset(str_profile_run_id, str_table_name, str_column_name): - str_schema = st.session_state["dbschema"] - str_sql = f""" - SELECT -- Identifiers - id::VARCHAR, dk_id, - p.project_code, connection_id, p.table_groups_id::VARCHAR, - p.profile_run_id::VARCHAR, - run_date, sample_ratio, - -- Column basics - p.schema_name, p.table_name, position, p.column_name, - p.column_type, general_type as general_type_abbr, - CASE general_type - WHEN 'A' THEN 'Alpha' - WHEN 'N' THEN 'Numeric' - WHEN 'D' THEN 'Date' - WHEN 'T' THEN 'Time' - WHEN 'B' THEN 'Boolean' - ELSE 'N/A' - END as general_type, - functional_table_type, functional_data_type, - datatype_suggestion, - CASE WHEN s.column_name IS NOT NULL THEN 'Yes' END as anomalies, - -- Shared counts - record_ct, value_ct, distinct_value_ct, null_value_ct, - -- Shared except for B and X - min_length, max_length, avg_length, - -- Alpha counts - distinct_std_value_ct, - numeric_ct, date_ct, - filled_value_ct as dummy_value_ct, - zero_length_ct, lead_space_ct, 
quoted_value_ct, - includes_digit_ct, - embedded_space_ct, avg_embedded_spaces, - min_text, max_text, - std_pattern_match, - top_patterns, - top_freq_values, distinct_value_hash, - distinct_pattern_ct, - -- A and N - zero_value_ct, - -- Numeric - min_value, min_value_over_0, max_value, - avg_value, stdev_value, percentile_25, percentile_50, percentile_75, - fractional_sum, - -- Dates - min_date, max_date, - before_1yr_date_ct, before_5yr_date_ct, within_1yr_date_ct, within_1mo_date_ct, future_date_ct, - date_days_present, date_weeks_present, date_months_present, - -- Boolean - boolean_true_ct - FROM {str_schema}.profile_results p - LEFT JOIN (SELECT DISTINCT profile_run_id, table_name, column_name - FROM {str_schema}.profile_anomaly_results) s - ON (p.profile_run_id = s.profile_run_id - AND p.table_name = s.table_name - AND p.column_name = s.column_name) - WHERE p.profile_run_id = '{str_profile_run_id}'::UUID - AND p.table_name ILIKE '{str_table_name}' - AND p.column_name ILIKE '{str_column_name}' - ORDER BY p.schema_name, p.table_name, position; - """ - - show_columns = ["schema_name", "table_name", "column_name", "column_type", "functional_data_type", "anomalies"] - - return db.retrieve_data(str_sql), show_columns - - -@st.cache_data(show_spinner="Retrieving Details") -def get_profile_screen(str_profile_run_id, str_table_name, str_column_name): - str_schema = st.session_state["dbschema"] - # Define the query - str_sql = f""" - SELECT pr.column_name, t.anomaly_name, replace(pr.detail, ' | ', ' ') as detail - FROM {str_schema}.profile_anomaly_results pr - INNER JOIN {str_schema}.profile_anomaly_types t - ON (pr.anomaly_id = t.id) - WHERE pr.profile_run_id = '{str_profile_run_id}'::UUID - AND pr.table_name = '{str_table_name}' - AND pr.column_name = '{str_column_name}' - AND t.anomaly_name <> 'Suggested Data Type' - ORDER BY anomaly_name; - """ - # Retrieve and return data as df - return db.retrieve_data(str_sql) - - -def reverse_count_category_pairs(input_str): - # Split the string by ' | ' to get individual elements - elements = input_str.split(" | ") - # Initialize an empty list to store reversed pairs - reversed_pairs = [] - display_pairs = [] - - # Loop to populate the list with reversed pairs - for i in range(0, len(elements), 2): - count = elements[i] - category = elements[i + 1] - - # Reverse count and category - reversed_pair = f"{category} | {count}" - reversed_pairs.append(reversed_pair) - # Reverse second version, for display on separate lines - display_pair = f"{category}: {count}" - display_pairs.append(display_pair) - - # Join the reversed pairs back into a single string - reversed_str = " | ".join(reversed_pairs) - - # Join the reversed pairs back into a single string - display_str = "
".join(display_pairs) - - return reversed_str, display_str - - -def write_profile_screen(selected_row): - df_screen = get_profile_screen( - selected_row["profile_run_id"], selected_row["table_name"], selected_row["column_name"] - ) - if not df_screen.empty: - with st.expander("**Profiling Anomalies**"): - # fm.render_markdown_table(df_screen, ["column_name", "anomaly_name", "detail"]) - st.dataframe(df_screen, use_container_width=True, hide_index=True) - - -def write_column_header(selected_row): - str_header = "Profiling Results" - lst_columns = [ - "column_name", - "table_name", - "schema_name", - "general_type", - "column_type", - "functional_data_type", - "datatype_suggestion", - ] - fm.render_html_list(selected_row, lst_columns, str_header, FORM_DATA_WIDTH) - - -def write_shared_header(selected_row): - str_header = "Data Overview" - # lst_columns = "record_ct, value_ct, distinct_value_ct, min_length, max_length, avg_length".split(", ") - lst_columns = "record_ct, value_ct, distinct_value_ct".split(", ") - fm.render_html_list(selected_row, lst_columns, str_header, FORM_DATA_WIDTH) - - -def write_alpha_missing_values(selected_row): - str_header = "Missing Values" - lst_columns = "null_value_ct, zero_length_ct, dummy_value_ct, zero_value_ct".split(", ") - fm.render_html_list(selected_row, lst_columns, str_header, FORM_DATA_WIDTH) - - -def write_numeric_missing_values(selected_row): - str_header = "Missing Values" - lst_columns = "null_value_ct, zero_value_ct".split(", ") - fm.render_html_list(selected_row, lst_columns, str_header, FORM_DATA_WIDTH) - - -def write_alpha_content_analysis(selected_row): - str_header = "Content Analysis" - lst_columns = "numeric_ct, date_ct, includes_digit_ct, embedded_space_ct, avg_embedded_spaces".split(", ") - fm.render_html_list(selected_row, lst_columns, str_header, FORM_DATA_WIDTH) - - -def write_alpha_value_analysis(selected_row): - str_header = "Value Analysis" - lst_columns = "min_length, max_length, avg_length, min_text, max_text, top_freq_values, distinct_pattern_ct, top_patterns, std_pattern_match".split( - ", " - ) - if selected_row["top_patterns"]: - # Need to reverse this, as it's saved | NNNN | Category | NNN | Category - str_top_patterns, str_top_patterns_display = reverse_count_category_pairs(selected_row["top_patterns"]) - selected_row["top_patterns"] = str_top_patterns_display - - fm.render_html_list(selected_row, lst_columns, str_header, FORM_DATA_WIDTH) - # Now reset for graph - if selected_row["top_patterns"]: - selected_row["top_patterns"] = str_top_patterns - - -def write_numeric_value_analysis(selected_row): - str_header = "Values and Ranges" - lst_columns = "min_value, min_value_over_0, max_value, min_length, max_length, avg_length".split(", ") - fm.render_html_list(selected_row, lst_columns, str_header, FORM_DATA_WIDTH) - - -def write_stats_value_analysis(selected_row): - str_header = "Descriptive Statistics" - lst_columns = "avg_value, stdev_value, percentile_25, percentile_50, percentile_75".split(", ") - fm.render_html_list(selected_row, lst_columns, str_header, FORM_DATA_WIDTH) - - -def write_date_analysis(selected_row): - str_header = "Date Value Analysis" - lst_columns = "min_date, max_date, before_1yr_date_ct, before_5yr_date_ct, within_1yr_date_ct, within_1mo_date_ct, future_date_ct".split( - ", " - ) - fm.render_html_list(selected_row, lst_columns, str_header, FORM_DATA_WIDTH) - - -def write_boolean_analysis(selected_row): - str_header = "Boolean Value Analysis" - lst_columns = "boolean_true_ct".split(", ") - 
fm.render_html_list(selected_row, lst_columns, str_header, FORM_DATA_WIDTH) - - -def write_missing_values_graph(value_ct, null_value_ct, zero_length_ct, dummy_values_ct): - lst_status = ["Value Present", "Null Value"] - lst_ct = [value_ct, null_value_ct] - - if zero_length_ct: - lst_status.append("Zero-Length") - lst_ct.append(zero_length_ct) - if dummy_values_ct: - lst_status.append("Dummy Value") - lst_ct.append(dummy_values_ct) - - dfg = pd.DataFrame({"Status": lst_status, "Count": lst_ct}) - - # fig = px.bar(dfg, x='Count', y='Status', orientation='h', title='Missing Values') - fig = px.pie(dfg, values="Count", names="Status", title="Missing Values") - # Show percentage in the pie chart - fig.update_traces(textinfo="percent+label") - fig.update_layout( - width=400, - title_font={"color": "green"}, - paper_bgcolor="rgba(0,0,0,0)", - plot_bgcolor="rgba(0,0,0,0)", - ) - - # Create the stacked bar chart - st.plotly_chart(fig, use_container_width=True) - - -def write_top_freq_graph(input_str): - lines = input_str.strip().split("\n") - - # Initialize empty lists to store categories and frequencies - categories = [] - frequencies = [] - - # Loop through each line to extract category and frequency - for line in lines: - parts = line.split(" | ") - # Remove the leading pipe character from the category - category = parts[0].replace("| ", "").strip() - frequency = int(parts[1]) - - categories.append(category) - frequencies.append(frequency) - - # Create a Pandas DataFrame - dff = pd.DataFrame({"Value": categories, "Frequency": frequencies}) - - # Calculate the total count and percentages - total_count = dff["Frequency"].sum() - dff["pct"] = (dff["Frequency"] / total_count * 100).round(2) - - # Create the Plotly Express histogram - fig = px.bar(dff, x="Value", y="Frequency", title="Value Frequency", text=dff["pct"].apply(lambda x: f"{x}%")) - # Update the trace to position text labels - fig.update_traces(textposition="outside") - fig.update_xaxes(type="category") - fig.update_layout( - width=400, - height=500, - title_font={"color": "green"}, - paper_bgcolor="rgba(0,0,0,0)", - plot_bgcolor="rgba(0,0,0,0)", - ) - - st.plotly_chart(fig) - - -def write_top_patterns_graph(input_str): - # Split the string by ' | ' to get individual elements - elements = input_str.split(" | ") - - # Initialize empty lists to store categories and frequencies - categories = [] - frequencies = [] - - # Loop to populate the lists with data - for i in range(0, len(elements), 2): - categories.append(elements[i]) - frequencies.append(int(elements[i + 1])) # Convert string to integer for count - - # Create a DataFrame using the populated lists - dff = pd.DataFrame({"Category": categories, "Frequency": frequencies}) - - # Create the Plotly Express histogram - fig = px.bar(dff, x="Category", y="Frequency", title="Top Patterns") - fig.update_layout( - width=400, - title_font={"color": "green"}, - paper_bgcolor="rgba(0,0,0,0)", - plot_bgcolor="rgba(0,0,0,0)", - ) - - st.plotly_chart(fig) - - -def write_box_plot(min_value, max_value, avg_value, stdev_value, percentile_25, percentile_75): - # Pick right IQR values - iqr_25 = percentile_25 if percentile_25 else avg_value - stdev_value - iqr_75 = percentile_75 if percentile_75 else avg_value + stdev_value - - # Create a DataFrame for the box plot - df = pd.DataFrame( - { - # "Value": [min_value, avg_value - stdev_value, avg_value, avg_value + stdev_value, max_value], - "Value": [min_value, iqr_25, avg_value, iqr_75, max_value], - "Category": ["Data Distribution"] * 5, - } - ) - - # 
Create a box plot - fig = px.box(df, y="Value", title="Summary Stats", labels={"Value": "Value"}) - - # Add Dot plot for min, max, and average - # fig.add_scatter( - # y=[min_value, avg_value, max_value], - # mode="markers", - # marker={"size": [10, 15, 10], "color": ["blue", "green", "red"]}, - # name="Min, Avg, Max", - # ) - - # Add line for standard deviation - fig.add_shape( - go.layout.Shape( - type="line", - x0=0.5, - x1=0.5, - y0=avg_value - stdev_value, - y1=avg_value + stdev_value, - line={ - "color": "Purple", - "width": 4, - "dash": "dot", - }, - ) - ) - - fig.update_layout( - width=400, - title_font={"color": "green"}, - paper_bgcolor="rgba(0,0,0,0)", - plot_bgcolor="rgba(0,0,0,0)", - ) - st.plotly_chart(fig) - - -def show_record_detail(selected_row): - write_profile_screen(selected_row) - - layout_column_1, layout_column_2 = st.columns([0.5, 0.5]) - - with layout_column_1: - write_column_header(selected_row) - write_shared_header(selected_row) - if selected_row["general_type_abbr"] == "A": - write_alpha_missing_values(selected_row) - write_alpha_content_analysis(selected_row) - write_alpha_value_analysis(selected_row) - elif selected_row["general_type_abbr"] == "N": - write_numeric_missing_values(selected_row) - write_numeric_value_analysis(selected_row) - write_stats_value_analysis(selected_row) - elif selected_row["general_type_abbr"] == "D": - write_date_analysis(selected_row) - # elif selected_row['general_type_abbr'] == "T": - elif selected_row["general_type_abbr"] == "B": - write_boolean_analysis(selected_row) - - with layout_column_2: - if selected_row["avg_value"] is not None: - write_box_plot( - selected_row["min_value"], - selected_row["max_value"], - selected_row["avg_value"], - selected_row["stdev_value"], - selected_row["percentile_25"], - selected_row["percentile_75"], - ) - if selected_row["top_freq_values"] is not None: - write_top_freq_graph(selected_row["top_freq_values"]) - if selected_row["top_patterns"] is not None: - write_top_patterns_graph(selected_row["top_patterns"]) - write_missing_values_graph( - selected_row["value_ct"], - selected_row["null_value_ct"], - selected_row["zero_length_ct"], - selected_row["dummy_value_ct"], - ) - - def generate_create_script(df): ddf = df[["schema_name", "table_name", "column_name", "column_type", "datatype_suggestion"]].copy() ddf.fillna("", inplace=True) diff --git a/testgen/ui/views/profiling_summary.py b/testgen/ui/views/profiling_summary.py index f10a06f..291a848 100644 --- a/testgen/ui/views/profiling_summary.py +++ b/testgen/ui/views/profiling_summary.py @@ -1,5 +1,5 @@ -from time import sleep import typing +from time import sleep import streamlit as st diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index 2762677..c78150a 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -44,6 +44,7 @@ def render(self, **_) -> None: "Test Definitions", "https://docs.datakitchen.io/article/dataops-testgen-help/testgen-test-types", lst_breadcrumbs=self.breadcrumbs, + boo_show_refresh=True, ) tool_bar = tb.ToolBar(5, 6, 4, None, multiline=True) @@ -233,11 +234,12 @@ def show_add_edit_modal_by_test_definition(test_definition_modal, test_definitio table_group = table_group_raw.iloc[0].to_dict() test_suite_raw = run_test_suite_lookup_query(table_group_id, test_suite_name) - test_suite = test_suite_raw.iloc[0].to_dict() + if not test_suite_raw.empty: + test_suite = test_suite_raw.iloc[0].to_dict() - show_add_edit_modal( - test_definition_modal, 
mode, project_code, table_group, test_suite, table_name, column_name, test_definition - ) + show_add_edit_modal( + test_definition_modal, mode, project_code, table_group, test_suite, table_name, column_name, test_definition + ) def show_add_edit_modal( @@ -740,7 +742,7 @@ def show_test_defs_grid( "profiling_as_of_date", "last_manual_update", ] - lst_wrap_colunns = ["final_test_description"] + lst_wrap_columns = ["final_test_description"] lst_export_headers = [ "Schema", "Table Name", @@ -759,7 +761,8 @@ def show_test_defs_grid( df, lst_export_columns, f"Test Definitions for Test Suite {str_test_suite_id}", - lst_wrap_colunns, + "{TIMESTAMP}", + lst_wrap_columns, lst_export_headers, ) diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index 05c9537..d4ab128 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -6,6 +6,7 @@ import plotly.graph_objects as go import streamlit as st +import testgen.ui.queries.profiling_queries as profiling_queries import testgen.ui.services.database_service as db import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq @@ -16,6 +17,8 @@ from testgen.ui.services.string_service import empty_if_null from testgen.ui.session import session from testgen.ui.views.test_definitions import show_add_edit_modal_by_test_definition +from testgen.ui.views.profiling_details import show_profiling_detail + ALWAYS_SPIN = False @@ -579,7 +582,9 @@ def show_result_detail(str_run_id, str_sel_test_status, do_multi_select, export_ "Message", "Action", ] - fm.render_excel_export(df, lst_export_columns, "Test Results", lst_wrap_colunns, lst_export_headers) + fm.render_excel_export( + df, lst_export_columns, "Test Results", "{TIMESTAMP}", lst_wrap_colunns, lst_export_headers + ) # Display history and detail for selected row if not selected_rows: @@ -600,8 +605,12 @@ def show_result_detail(str_run_id, str_sel_test_status, do_multi_select, export_ pg_col1, pg_col2 = st.columns([0.5, 0.5]) with pg_col2: - _, v_col2 = st.columns([0.6, 0.4]) - view_bad_data(v_col2, selected_row) + v_col1, v_col2, v_col3 = st.columns([0.33, 0.33, 0.33]) + view_edit_test(v_col1, selected_row["test_definition_id_current"]) + view_profiling( + v_col2, selected_row["table_name"], selected_row["column_names"], selected_row["table_groups_id"] + ) + view_bad_data(v_col3, selected_row) with pg_col1: fm.show_subheader(selected_row["test_name_short"]) @@ -785,21 +794,12 @@ def view_bad_data(button_container, selected_row): str_header = f"Column: {selected_row['column_names']}, Table: {selected_row['table_name']}" bad_data_modal = testgen.Modal(title=None, key="dk-test-data-modal", max_width=1100) - edit_test_definition_modal = testgen.Modal(title="Edit Test", key="dk-test-definition-edit-modal", max_width=1100) - with button_container: if st.button( - "Review Source Data →", help="Review source data for highlighted result", use_container_width=True + ":green[Source Data →]", help="Review current source data for highlighted result", use_container_width=True ): bad_data_modal.open() - if st.button("🖊️ Edit Test", help="Edit the Test Definition", use_container_width=True): - edit_test_definition_modal.open() - - if edit_test_definition_modal.is_open(): - test_definition_id = selected_row["test_definition_id_current"] - show_add_edit_modal_by_test_definition(edit_test_definition_modal, test_definition_id) - if bad_data_modal.is_open(): with bad_data_modal.container(): fm.render_modal_header(selected_row["test_name_short"], 
None) @@ -830,3 +830,35 @@ def view_bad_data(button_container, selected_row): df_bad.fillna("[NULL]", inplace=True) # Display the dataframe st.dataframe(df_bad, height=500, width=1050, hide_index=True) + + +def view_profiling(button_container, str_table_name, str_column_name, str_table_groups_id): + str_header = f"Column: {str_column_name}, Table: {str_table_name}" + + # Retrieve latest profiling + str_profiling_run_id = profiling_queries.get_latest_profile_run(str_table_groups_id) + if str_profiling_run_id: + df = profiling_queries.get_profiling_detail(str_profiling_run_id, str_table_name, str_column_name) + if not df.empty: + profiling_modal = testgen.Modal(title=None, key="dk-anomaly-profiling-modal", max_width=1100) + + with button_container: + if st.button( + ":green[Profiling →]", help="Review profiling for highlighted column", use_container_width=True + ): + profiling_modal.open() + + if profiling_modal.is_open(): + with profiling_modal.container(): + fm.render_modal_header(str_header, None) + show_profiling_detail(df.iloc[0], 300) + + +def view_edit_test(button_container, test_definition_id): + edit_test_definition_modal = testgen.Modal(title="Edit Test", key="dk-test-definition-edit-modal", max_width=1100) + with button_container: + if st.button("🖊️ Edit Test", help="Edit the Test Definition", use_container_width=True): + edit_test_definition_modal.open() + + if edit_test_definition_modal.is_open(): + show_add_edit_modal_by_test_definition(edit_test_definition_modal, test_definition_id) diff --git a/testgen/ui/views/test_runs.py b/testgen/ui/views/test_runs.py index b624004..c34f63b 100644 --- a/testgen/ui/views/test_runs.py +++ b/testgen/ui/views/test_runs.py @@ -1,5 +1,5 @@ -from time import sleep import typing +from time import sleep import streamlit as st diff --git a/testgen/ui/views/test_suites.py b/testgen/ui/views/test_suites.py index 7ccb39d..3df1558 100644 --- a/testgen/ui/views/test_suites.py +++ b/testgen/ui/views/test_suites.py @@ -102,7 +102,7 @@ def render(self, connection_id: str | None = None, table_group_id: str | None = delete_modal.open() if tool_bar.short_slots[4].button( - "Tests →", + ":green[Tests →]", help="View and edit Test Definitions for selected Test Suite", disabled=disable_buttons, use_container_width=True, @@ -191,39 +191,45 @@ def show_record_detail( with right_column: st.write("
<br/><br/>
", unsafe_allow_html=True) - _, button_column = st.columns([0.3, 0.7]) + _, button_column = st.columns([0.2, 0.8]) with button_column: - if st.button( - "Show Test Generation Command for CLI", - help="Shows the run-test-generation CLI command", - use_container_width=True, - ): - show_run_test_generation_modal.open() - - if st.button("Run Test Generation Now", help="Run Test Generation", use_container_width=True): - run_test_generation_modal.open() - - if st.button( - "Show Test Execution Command for CLI", help="Shows the run-tests CLI command", use_container_width=True - ): - show_test_run_command_modal.open() - - if st.button("Run Test Execution Now", help="Run the tests", use_container_width=True): - run_tests_command_modal.open() - - if st.button( - "Show Observability Export Command for CLI", - help="Shows the export-observability CLI command", - use_container_width=True, - ): - show_export_command_modal.open() - - if st.button( - "Run Observability Export Now", - help="Exports test results to Observability for the current Test Suite", - use_container_width=True, - ): - run_export_command_modal.open() + run_now_commands_tab, cli_commands_tab = st.tabs(["Test Suite Actions", "View CLI Commands"]) + + with cli_commands_tab: + if st.button( + "Test Generation Command", + help="Shows the run-test-generation CLI command", + use_container_width=True, + ): + show_run_test_generation_modal.open() + + if st.button( + "Test Execution Command", + help="Shows the run-tests CLI command", + use_container_width=True, + ): + show_test_run_command_modal.open() + + if st.button( + "Observability Export Command", + help="Shows the export-observability CLI command", + use_container_width=True, + ): + show_export_command_modal.open() + + with run_now_commands_tab: + if st.button("Run Test Generation", help="Run Test Generation", use_container_width=True): + run_test_generation_modal.open() + + if st.button("Run Test Execution", help="Run the tests", use_container_width=True): + run_tests_command_modal.open() + + if st.button( + "Run Observability Export", + help="Exports test results to Observability for the current Test Suite", + use_container_width=True, + ): + run_export_command_modal.open() def show_run_test_generation(modal, selected): @@ -234,9 +240,51 @@ def show_run_test_generation(modal, selected): with container: st.markdown(":green[**Execute Test Generation for the Test Suite**]") + warning_container = st.container() + options_container = st.container() button_container = st.empty() status_container = st.empty() + test_ct, unlocked_test_ct, unlocked_edits_ct = test_suite_service.get_test_suite_refresh_warning( + selected_test_suite["test_suite"] + ) + if test_ct: + warning_msg = "" + counts_msg = f"\n\nTests: {test_ct}, Unlocked: {unlocked_test_ct}, Edited Unlocked: {unlocked_edits_ct}" + if unlocked_edits_ct > 0: + if unlocked_edits_ct > 1: + warning_msg = ( + "Manual changes have been made to tests in this Test Suite that have not been locked. " + ) + else: + warning_msg = ( + "A manual change has been made to a test in this Test Suite that has not been locked. " + ) + elif unlocked_test_ct > 0: + warning_msg = "Auto-generated tests are present in this Test Suite that have not been locked. 
" + warning_msg = ( + f"{warning_msg}Generating tests now will overwrite all unlocked tests currently in the " + f"test suite with new tests based on the latest profiling.{counts_msg}" + ) + with warning_container: + st.warning(warning_msg) + if unlocked_edits_ct > 0: + lock_edits_button = st.button("Lock Edited Tests") + if lock_edits_button: + edits_locked = test_suite_service.lock_edited_tests(selected_test_suite["test_suite"]) + if edits_locked: + st.info("Edited tests have been successfully locked.") + + with options_container: + lst_generation_sets = test_suite_service.get_generation_set_choices() + if lst_generation_sets: + lst_generation_sets.insert(0, "(All Test Types)") + str_generation_set = st.selectbox("Generation Set", lst_generation_sets) + if str_generation_set == "(All Test Types)": + str_generation_set = "" + else: + str_generation_set = "" + with button_container: start_process_button_message = "Start" test_generation_button = st.button(start_process_button_message) @@ -249,7 +297,7 @@ def show_run_test_generation(modal, selected): status_container.info("Executing Test Generation...") try: - run_test_gen_queries(table_group_id, test_suite_key) + run_test_gen_queries(table_group_id, test_suite_key, str_generation_set) except Exception as e: status_container.empty() status_container.error(f"Process had errors: {e!s}.") From 54a58e7ff58e88f7224fe166b724aa8e5d5b177e Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Thu, 2 May 2024 09:17:22 -0400 Subject: [PATCH 10/19] fix(snowflake): sampling limit set to 999000 Fixes an error where the sampling query could request over 1M records when Snowflake has 1M records hard limit itself. --- .../project_get_table_sample_count.sql | 4 ++-- testgen/ui/services/form_service.py | 15 +++++++++--- testgen/ui/views/profiling_summary.py | 10 +------- testgen/ui/views/test_results.py | 23 ++++++++++++------- testgen/ui/views/test_runs.py | 10 +------- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/testgen/template/profiling/project_get_table_sample_count.sql b/testgen/template/profiling/project_get_table_sample_count.sql index a2cdf55..d80a22f 100644 --- a/testgen/template/profiling/project_get_table_sample_count.sql +++ b/testgen/template/profiling/project_get_table_sample_count.sql @@ -5,7 +5,7 @@ SELECT '{SAMPLING_TABLE}' as schema_table, ELSE CASE WHEN ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0, 0) > {PROFILE_SAMPLE_MIN_COUNT} - THEN ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0, 0) + THEN LEAST(999000, ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0, 0)) ELSE {PROFILE_SAMPLE_MIN_COUNT} END END as sample_count, @@ -15,7 +15,7 @@ SELECT '{SAMPLING_TABLE}' as schema_table, ELSE (CAST(COUNT(*) as FLOAT) / CASE WHEN ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0, 0) > {PROFILE_SAMPLE_MIN_COUNT} - THEN ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0, 0) + THEN LEAST(999000, ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0, 0)) ELSE {PROFILE_SAMPLE_MIN_COUNT} END ) END as sample_ratio diff --git a/testgen/ui/services/form_service.py b/testgen/ui/services/form_service.py index f3c9a61..498e3c6 100644 --- a/testgen/ui/services/form_service.py +++ b/testgen/ui/services/form_service.py @@ -79,6 +79,10 @@ def __init__( read_only=False, required=False, int_key=0, + max_chars=None, + num_min=None, + num_max=None, + text_multi_lines=3, ): 
self.field_label = str_label self.column_name = str_column_name @@ -88,6 +92,10 @@ def __init__( self.display_only = read_only self.required = required self.key_order = int_key + self.max_chars = max_chars + self.num_min = num_min + self.num_max = num_max + self.text_multi_lines = text_multi_lines def set_select_choices(self, df_options, str_show_column_name, str_return_column_name): if self.widget in [FormWidget.selectbox, FormWidget.multiselect]: @@ -104,7 +112,7 @@ def render_widget(self, boo_form_display_only=False): match self.widget: case FormWidget.text_md: st.markdown(f"**{self.field_label}**") - st.text(self.init_value) + st.markdown(self.init_value) case FormWidget.text_input: self.value = st.text_input( @@ -112,7 +120,7 @@ def render_widget(self, boo_form_display_only=False): ) case FormWidget.text_area: - box_height = 15 * self.text_multi_lines + box_height = 26 * self.text_multi_lines self.value = st.text_area( label=self.field_label, value=self.init_value, @@ -305,7 +313,8 @@ def render_form_by_field_specs( with layout_column_1: # Render form - with st.container if boo_display_only else st.form(str_form_name, clear_on_submit=True): + layout_container = st.container() if boo_display_only else st.form(str_form_name, clear_on_submit=True) + with layout_container: if str_caption: st.caption(f":green[{str_caption}]") diff --git a/testgen/ui/views/profiling_summary.py b/testgen/ui/views/profiling_summary.py index 291a848..2d3704d 100644 --- a/testgen/ui/views/profiling_summary.py +++ b/testgen/ui/views/profiling_summary.py @@ -1,5 +1,4 @@ import typing -from time import sleep import streamlit as st @@ -38,7 +37,7 @@ def render(self) -> None: str_project = st.session_state["project"] # Setup Toolbar - tool_bar = tb.ToolBar(2, 3, 0, None) + tool_bar = tb.ToolBar(3, 2, 0, None) with tool_bar.long_slots[0]: # Table Groups selection -- optional criterion @@ -47,13 +46,6 @@ def render(self) -> None: "Table Group", df_tg, "table_groups_name", "id", boo_required=False, str_default=None ) - with tool_bar.short_slots[0]: - if st.button("⟳", help="Refresh the grid", key="refresh-button-profiling"): - st.cache_data.clear() - st.toast("Page Refreshed!") - sleep(1) - st.experimental_rerun() - df, show_columns = get_db_profiling_runs(str_project, str_table_groups_id) time_columns = ["start_time"] diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index d4ab128..e371e89 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -307,7 +307,7 @@ def get_test_definition(str_test_def_id): def get_test_definition_uncached(str_schema, str_test_def_id): str_sql = f""" SELECT d.id::VARCHAR, tt.test_name_short as test_name, tt.test_name_long as full_name, - tt.test_description as description, + tt.test_description as description, tt.usage_notes, d.baseline_value, d.baseline_ct, d.baseline_avg, d.baseline_sd, d.threshold_value, d.custom_query, d.severity, tt.default_severity, @@ -444,7 +444,7 @@ def do_source_data_lookup_custom(selected_row): return "ERR", f"Source data lookup query caused an error:\n\n{e.args[0]}\n\n{str_sql}", None -def show_test_tweak_form(str_test_def_id): +def show_test_def_detail(str_test_def_id): df = get_test_definition(str_test_def_id) specs = [] @@ -453,7 +453,14 @@ def show_test_tweak_form(str_test_def_id): row = df.iloc[0] specs.append( - fm.FieldSpec("Description", "description", fm.FormWidget.text_area, row["description"], read_only=True) + fm.FieldSpec( + "Usage Notes", + "usage_notes", + fm.FormWidget.text_area, + 
row["usage_notes"], + read_only=True, + text_multi_lines=7, + ) ) specs.append( fm.FieldSpec( @@ -497,15 +504,15 @@ def show_test_tweak_form(str_test_def_id): "last_manual_update", fm.FormWidget.date_input, row["last_manual_update"], - date.today().strftime("%Y-%m-%d"), + date.today().strftime("%Y-%m-%d hh:mm:ss"), read_only=True, ) ) fm.render_form_by_field_specs( - "Edit Test Configuration", + None, "test_definitions", specs, - str_caption="Edit test configuration for future test runs.", + boo_display_only=True, ) @@ -619,14 +626,14 @@ def show_result_detail(str_run_id, str_sel_test_status, do_multi_select, export_ st.caption(empty_if_null(selected_row["measure_uom_description"])) fm.render_grid_select(dfh, show_hist_columns) with pg_col2: - ut_tab1, ut_tab2 = st.tabs(["History", "Test Configuration"]) + ut_tab1, ut_tab2 = st.tabs(["History", "Test Definition"]) with ut_tab1: if dfh.empty: st.write("Test history not available.") else: write_history_graph(dfh) with ut_tab2: - show_test_tweak_form(selected_row["test_definition_id_current"]) + show_test_def_detail(selected_row["test_definition_id_current"]) return selected_rows diff --git a/testgen/ui/views/test_runs.py b/testgen/ui/views/test_runs.py index c34f63b..20b22c0 100644 --- a/testgen/ui/views/test_runs.py +++ b/testgen/ui/views/test_runs.py @@ -1,5 +1,4 @@ import typing -from time import sleep import streamlit as st @@ -37,7 +36,7 @@ def render(self) -> None: str_project = st.session_state["project"] # Setup Toolbar - tool_bar = tb.ToolBar(4, 2, 0, None) + tool_bar = tb.ToolBar(4, 1, 0, None) with tool_bar.long_slots[0]: # Table Groups selection -- optional criterion @@ -46,13 +45,6 @@ def render(self) -> None: "Table Group", df_tg, "table_groups_name", "id", boo_required=False, str_default=None ) - with tool_bar.short_slots[0]: - if st.button("⟳", help="Refresh the grid", key="refresh-button-test-runs"): - st.cache_data.clear() - st.toast("Page Refreshed!") - sleep(1) - st.experimental_rerun() - with tool_bar.long_slots[1]: # Table Groups selection -- optional criterion df_ts = get_db_test_suite_choices(str_project, str_table_groups_id) From 34a32395427c1b8abd8ab58525984f25f5aecd46 Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Fri, 3 May 2024 09:07:58 -0400 Subject: [PATCH 11/19] fix(plugins): enhance plugin support --- Dockerfile | 2 +- testgen/ui/bootstrap.py | 11 ++++++++--- testgen/ui/components/utils/callbacks.py | 2 +- testgen/ui/navigation/router.py | 10 +++++----- testgen/ui/views/profiling_anomalies.py | 5 ++--- testgen/ui/views/profiling_results.py | 6 ------ testgen/ui/views/profiling_summary.py | 4 ++-- testgen/ui/views/test_results.py | 3 +-- testgen/ui/views/test_runs.py | 2 +- testgen/utils/plugins.py | 1 - 10 files changed, 21 insertions(+), 25 deletions(-) diff --git a/Dockerfile b/Dockerfile index b676b1c..1366f18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,7 +31,7 @@ ENV PATH="$PATH:/dk/bin:/opt/mssql-tools/bin/" RUN TG_METADATA_DB_USER=- TG_METADATA_DB_PASSWORD=- TG_METADATA_DB_HOST=- TG_METADATA_DB_PORT=- testgen ui patch-streamlit ARG TESTGEN_VERSION -ENV TESTGEN_VERSION=$TESTGEN_VERSION +ENV TESTGEN_VERSION=v$TESTGEN_VERSION WORKDIR /dk diff --git a/testgen/ui/bootstrap.py b/testgen/ui/bootstrap.py index 6d75ea4..2333193 100644 --- a/testgen/ui/bootstrap.py +++ b/testgen/ui/bootstrap.py @@ -25,7 +25,7 @@ from testgen.ui.views.test_suites import TestSuitesPage from testgen.utils import plugins, singleton -BUILTIN_PAGES: list[Page] = [ +BUILTIN_PAGES: list[type[Page]] = [ LoginPage, OverviewPage, 
TestDefinitionsPage, @@ -82,7 +82,12 @@ def run(log_level: int = logging.INFO) -> Application: return Application( router=Router(routes=pages, default=NotFoundPage), menu=Menu( - items=[dataclasses.replace(page.menu_item, page=page.path) for page in pages if page.menu_item], + items=list( + { + page.path: dataclasses.replace(page.menu_item, page=page.path) + for page in pages if page.menu_item + }.values() + ), version=Version( current=settings.VERSION, latest="...", @@ -93,7 +98,7 @@ def run(log_level: int = logging.INFO) -> Application: ) -def _get_schema_rev(): +def _get_schema_rev() -> str: revision = session.sb_schema_rev if not revision: revision = session.sb_schema_rev = get_schema_revision() diff --git a/testgen/ui/components/utils/callbacks.py b/testgen/ui/components/utils/callbacks.py index 40addee..f486d5b 100644 --- a/testgen/ui/components/utils/callbacks.py +++ b/testgen/ui/components/utils/callbacks.py @@ -48,4 +48,4 @@ def register_callback(element_key, callback, *callback_args, **callback_kwargs): try: session_state._components_callbacks[element_key] = (callback, callback_args, callback_kwargs) except: - logger.debug("unexpected error registering component callback", exc_info=True, stack_info=True) + logger.debug("unexpected error registering component callback", exc_info=False, stack_info=False) diff --git a/testgen/ui/navigation/router.py b/testgen/ui/navigation/router.py index 842c044..6a2e240 100644 --- a/testgen/ui/navigation/router.py +++ b/testgen/ui/navigation/router.py @@ -15,14 +15,14 @@ class Router(Singleton): active: testgen.ui.navigation.page.Page | None - _default: testgen.ui.navigation.page.Page | None - _routes: dict[str, testgen.ui.navigation.page.Page] + _default: type[testgen.ui.navigation.page.Page] | None + _routes: dict[str, type[testgen.ui.navigation.page.Page]] def __init__( self, /, - routes: list[testgen.ui.navigation.page.Page], - default: testgen.ui.navigation.page.Page = None, + routes: list[type[testgen.ui.navigation.page.Page]], + default: type[testgen.ui.navigation.page.Page] | None = None, ) -> None: self._routes = {} @@ -43,7 +43,7 @@ def navigate(self, /, to: str, with_args: dict | None = None) -> None: if type(can_activate) == str: return self.navigate(to=can_activate, with_args={}) - if not can_activate: + if not can_activate and self._default: return self.navigate(to=self._default.path, with_args=with_args) if not isinstance(self.active, route): diff --git a/testgen/ui/views/profiling_anomalies.py b/testgen/ui/views/profiling_anomalies.py index 5e13bc7..76dd3e4 100644 --- a/testgen/ui/views/profiling_anomalies.py +++ b/testgen/ui/views/profiling_anomalies.py @@ -3,16 +3,15 @@ import plotly.express as px import streamlit as st +import testgen.ui.queries.profiling_queries as profiling_queries import testgen.ui.services.database_service as db import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq import testgen.ui.services.toolbar_service as tb -import testgen.ui.queries.profiling_queries as profiling_queries -from testgen.ui.views.profiling_details import show_profiling_detail - from testgen.ui.components import widgets as testgen from testgen.ui.navigation.page import Page from testgen.ui.session import session +from testgen.ui.views.profiling_details import show_profiling_detail class ProfilingAnomaliesPage(Page): diff --git a/testgen/ui/views/profiling_results.py b/testgen/ui/views/profiling_results.py index 4768e05..bfcb561 100644 --- a/testgen/ui/views/profiling_results.py +++ 
b/testgen/ui/views/profiling_results.py @@ -1,21 +1,15 @@ import typing -import pandas as pd -import plotly.express as px -import plotly.graph_objects as go import streamlit as st import testgen.ui.queries.profiling_queries as profiling_queries -import testgen.ui.services.database_service as db import testgen.ui.services.form_service as fm -import testgen.ui.services.query_service as dq import testgen.ui.services.toolbar_service as tb from testgen.common import date_service from testgen.ui.navigation.page import Page from testgen.ui.session import session from testgen.ui.views.profiling_details import show_profiling_detail - FORM_DATA_WIDTH = 400 diff --git a/testgen/ui/views/profiling_summary.py b/testgen/ui/views/profiling_summary.py index 2d3704d..f48b759 100644 --- a/testgen/ui/views/profiling_summary.py +++ b/testgen/ui/views/profiling_summary.py @@ -109,7 +109,7 @@ def open_drill_downs(dct_selected_rows, button_slots): if dct_selected_rows: dct_selected_row = dct_selected_rows[0] - if button_slots[1].button( + if button_slots[0].button( "Profiling Results →", help="Review profiling characteristics for each data column", use_container_width=True, @@ -120,7 +120,7 @@ def open_drill_downs(dct_selected_rows, button_slots): session.current_page_args = {} st.experimental_rerun() - if button_slots[2].button( + if button_slots[1].button( "Anomalies →", help="Review potential data problems identified in profiling", use_container_width=True, diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index e371e89..c7b39f9 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -16,9 +16,8 @@ from testgen.ui.navigation.page import Page from testgen.ui.services.string_service import empty_if_null from testgen.ui.session import session -from testgen.ui.views.test_definitions import show_add_edit_modal_by_test_definition from testgen.ui.views.profiling_details import show_profiling_detail - +from testgen.ui.views.test_definitions import show_add_edit_modal_by_test_definition ALWAYS_SPIN = False diff --git a/testgen/ui/views/test_runs.py b/testgen/ui/views/test_runs.py index 20b22c0..252fa9e 100644 --- a/testgen/ui/views/test_runs.py +++ b/testgen/ui/views/test_runs.py @@ -60,7 +60,7 @@ def render(self) -> None: dct_selected_rows = fm.render_grid_select(df, show_columns) dct_selected_row = dct_selected_rows[0] if dct_selected_rows else None - if tool_bar.short_slots[1].button( + if tool_bar.short_slots[0].button( "Test Results →", help="Review test results for the selected run", use_container_width=True, diff --git a/testgen/utils/plugins.py b/testgen/utils/plugins.py index d0cdd5a..1660752 100644 --- a/testgen/utils/plugins.py +++ b/testgen/utils/plugins.py @@ -1,5 +1,4 @@ import dataclasses -import importlib import importlib.metadata import typing From 6b28ae5d3b59eb9282a88dd18427831da4123ede Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Tue, 7 May 2024 07:47:00 -0400 Subject: [PATCH 12/19] fix(db): set initial revision to start at 0100 --- ...{0000_incremental_upgrade.sql => 0100_incremental_upgrade.sql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename testgen/template/dbupgrade/{0000_incremental_upgrade.sql => 0100_incremental_upgrade.sql} (100%) diff --git a/testgen/template/dbupgrade/0000_incremental_upgrade.sql b/testgen/template/dbupgrade/0100_incremental_upgrade.sql similarity index 100% rename from testgen/template/dbupgrade/0000_incremental_upgrade.sql rename to testgen/template/dbupgrade/0100_incremental_upgrade.sql 
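Note on PATCH 12 above: it renames the first incremental upgrade script so the revision sequence starts at 0100 instead of 0000. As a minimal sketch only, assuming (hypothetically; this is not TestGen's actual implementation) that the current schema revision is derived from the numeric prefix of *_incremental_upgrade.sql files under testgen/template/dbupgrade/, the idea looks roughly like this:

    # Hypothetical sketch: derive the latest schema revision from the numeric
    # prefix of scripts named "<NNNN>_incremental_upgrade.sql".
    from pathlib import Path

    def latest_upgrade_revision(dbupgrade_dir: str) -> int:
        prefixes = (
            path.name.split("_", 1)[0]
            for path in Path(dbupgrade_dir).glob("*_incremental_upgrade.sql")
        )
        revisions = [int(prefix) for prefix in prefixes if prefix.isdigit()]
        # With only 0100_incremental_upgrade.sql present, this returns 100.
        return max(revisions, default=0)

Under that assumption, renaming 0000_incremental_upgrade.sql to 0100_incremental_upgrade.sql (a 100% similarity rename, so the script body is unchanged) simply shifts the reported starting revision from 0 to 100.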
From 7bad3ee4da14f396b8874c74820127ab2c891b1f Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Mon, 13 May 2024 08:14:43 -0400 Subject: [PATCH 13/19] fix(test generation): allow nullable generation set --- testgen/commands/run_generate_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/testgen/commands/run_generate_tests.py b/testgen/commands/run_generate_tests.py index 2356442..c4f2234 100644 --- a/testgen/commands/run_generate_tests.py +++ b/testgen/commands/run_generate_tests.py @@ -6,7 +6,7 @@ LOG = logging.getLogger("testgen.cli") -def run_test_gen_queries(strTableGroupsID, strTestSuite, strGenerationSet=""): +def run_test_gen_queries(strTableGroupsID, strTestSuite, strGenerationSet=None): if strTableGroupsID is None: raise ValueError("Table Group ID was not specified") @@ -37,7 +37,7 @@ def run_test_gen_queries(strTableGroupsID, strTestSuite, strGenerationSet=""): # Set static parms clsTests.project_code = dctParms["project_code"] clsTests.test_suite = strTestSuite - clsTests.generation_set = strGenerationSet + clsTests.generation_set = strGenerationSet if strGenerationSet is not None else "" clsTests.connection_id = str(dctParms["connection_id"]) clsTests.table_groups_id = strTableGroupsID clsTests.sql_flavor = dctParms["sql_flavor"] From 6bba5b6bd9adce264fb41f2999acb19c5530b29e Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Mon, 13 May 2024 08:20:06 -0400 Subject: [PATCH 14/19] fix(profiling): excel exports show datetime in user tz Before the fix, the date fields in the exported excel were being displayed in UTC timezone. --- testgen/common/date_service.py | 5 +++++ testgen/ui/services/form_service.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/testgen/common/date_service.py b/testgen/common/date_service.py index 3ab4629..510bbdc 100644 --- a/testgen/common/date_service.py +++ b/testgen/common/date_service.py @@ -61,3 +61,8 @@ def get_timezoned_timestamp(streamlit_session, value): df["value"] = df["value"].dt.tz_localize("UTC").dt.tz_convert(timezone).dt.strftime("%Y-%m-%d %H:%M:%S") ret = df.iloc[0, 0] return ret + + +def get_timezoned_now(streamlit_session): + value = datetime.utcnow() + return get_timezoned_timestamp(streamlit_session, value) diff --git a/testgen/ui/services/form_service.py b/testgen/ui/services/form_service.py index 498e3c6..e57c846 100644 --- a/testgen/ui/services/form_service.py +++ b/testgen/ui/services/form_service.py @@ -16,6 +16,7 @@ from st_aggrid import AgGrid, ColumnsAutoSizeMode, DataReturnMode, GridOptionsBuilder, GridUpdateMode, JsCode from streamlit_extras.no_default_selectbox import selectbox +import testgen.common.date_service as date_service import testgen.ui.services.authentication_service as authentication_service import testgen.ui.services.database_service as db from testgen.ui.components import widgets as testgen @@ -243,7 +244,7 @@ def _generate_excel_export( worksheet.write("A2", str_title, title_format) if str_caption: - str_caption = str_caption.replace("{TIMESTAMP}", datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + str_caption = str_caption.replace("{TIMESTAMP}", date_service.get_timezoned_now(st.session_state)) caption_format = workbook.add_format({"italic": True, "size": 9, "valign": "top"}) worksheet.write("A3", str_caption, caption_format) From e7b39eeccb62691e979004b3fd22dfd64d49bf17 Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Mon, 13 May 2024 08:22:45 -0400 Subject: [PATCH 15/19] fix(ui): hide 'reveal password' icon button Avoid users from reveling secrets from 
password fields. --- testgen/ui/assets/style.css | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/testgen/ui/assets/style.css b/testgen/ui/assets/style.css index f9cc0c4..ddb1f5f 100644 --- a/testgen/ui/assets/style.css +++ b/testgen/ui/assets/style.css @@ -96,6 +96,10 @@ div[data-modal-container="true"] > div:first-child > div:first-child { max-height: 90vh; overflow-y: auto !important; } + +button[title="Show password text"] { + display: none; +} /* ... */ /* Dark mode */ From 2adbb4e97bf59bed42aa27e272b1512eca07e646 Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Mon, 13 May 2024 08:35:10 -0400 Subject: [PATCH 16/19] fix(tests): fix bad zip errors for mssql Test types 'Valid_US_Zip' and 'Valid_US_Zip3' in MSSQL were flagging null values incorrectly as bad zips. --- .../050_populate_new_schema_metadata.sql | 57 ++++++++++--------- .../project_profiling_query_mssql.yaml | 4 +- .../project_profiling_query_postgresql.yaml | 2 +- .../project_profiling_query_redshift.yaml | 2 +- .../project_profiling_query_snowflake.yaml | 2 +- .../create_functions_snowflake.sql | 2 +- .../project_profiling_query_trino.yaml | 2 +- 7 files changed, 35 insertions(+), 36 deletions(-) diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql index acc5de1..2401eb9 100644 --- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql +++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql @@ -377,17 +377,17 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('4033', 'Valid_Month', 'postgresql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('5033', 'Valid_Month', 'trino', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('1034', 'Valid_US_Zip', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} NOT SIMILAR TO ''([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4034', 'Valid_US_Zip', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} NOT SIMILAR TO ''([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('2034', 'Valid_US_Zip', 'snowflake', 'SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME}::VARCHAR, ''^([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})$'') = False THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5034', 'Valid_US_Zip', 'trino', 'SUM(CASE WHEN REGEXP_LIKE({COLUMN_NAME}, ''^([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})$'') = False THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3034', 'Valid_US_Zip', 'mssql', 'SUM(CASE WHEN {COLUMN_NAME} LIKE ''[0-9][0-9][0-9][0-9][0-9]'' OR {COLUMN_NAME} LIKE ''[0-9][0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'' OR {COLUMN_NAME} LIKE ''[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'' THEN 0 ELSE 1 END)', '>', '{THRESHOLD_VALUE}'), + ('1034', 'Valid_US_Zip', 'redshift', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('4034', 'Valid_US_Zip', 'postgresql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('2034', 'Valid_US_Zip', 'snowflake', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('5034', 
'Valid_US_Zip', 'trino', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('3034', 'Valid_US_Zip', 'mssql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('1035', 'Valid_US_Zip3', 'redshift', 'SUM(CASE WHEN LENGTH({COLUMN_NAME}) <> 3 OR TRANSLATE({COLUMN_NAME},''0123456789'','''') <> '''' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4035', 'Valid_US_Zip3', 'postgresql', 'SUM(CASE WHEN LENGTH({COLUMN_NAME}) <> 3 OR TRANSLATE({COLUMN_NAME},''0123456789'','''') <> '''' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('2035', 'Valid_US_Zip3', 'snowflake', 'SUM(CASE WHEN LENGTH({COLUMN_NAME}) <> 3 OR TRANSLATE({COLUMN_NAME},''0123456789'','''') <> '''' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('5035', 'Valid_US_Zip3', 'trino', 'SUM(CASE WHEN LENGTH({COLUMN_NAME}) <> 3 OR TRANSLATE({COLUMN_NAME},''0123456789'','''') <> '''' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('3035', 'Valid_US_Zip3', 'mssql', 'SUM(CASE WHEN LENGTH({COLUMN_NAME}) <> 3 OR TRANSLATE({COLUMN_NAME},''0123456789'','''') <> '''' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('1035', 'Valid_US_Zip3', 'redshift', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('4035', 'Valid_US_Zip3', 'postgresql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('2035', 'Valid_US_Zip3', 'snowflake', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('5035', 'Valid_US_Zip3', 'trino', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), + ('3035', 'Valid_US_Zip3', 'mssql', 'SUM(CASE WHEN TRANSLATE({COLUMN_NAME},''012345678'',''999999999'') <> ''999'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('1036', 'Valid_Characters', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} ~ ''[[:cntrl:]]'' OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME} LIKE ''''''%'''''' OR {COLUMN_NAME} LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('4036', 'Valid_Characters', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} ~ ''[[:cntrl:]]'' OR {COLUMN_NAME} LIKE '' %'' OR {COLUMN_NAME}::VARCHAR LIKE ''''''%'''''' OR column_name::VARCHAR LIKE ''"%"'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), @@ -436,7 +436,7 @@ VALUES ('1033', '1001', 'Profile Anomaly' , 'Suggested_Type', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), ('1034', '1002', 'Profile Anomaly' , 'Non_Standard_Blanks', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', 
''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1035', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (len("{COLUMN_NAME}") >= 1 and len("{COLUMN_NAME}") <= 4) OR (len("{COLUMN_NAME}") > 10) OR (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'), + ('1035', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'), ('1036', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'redshift', NULL, 'SELECT DISTINCT column_name, table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = ''{TARGET_SCHEMA}'' AND column_name = ''{COLUMN_NAME}'' ORDER BY data_type, table_name;'), ('1037', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'redshift', NULL, 'SELECT DISTINCT column_name, table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = ''{TARGET_SCHEMA}'' AND column_name = ''{COLUMN_NAME}'' ORDER BY data_type, table_name;'), ('1038', '1006', 'Profile Anomaly' , 'No_Values', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), @@ -457,12 +457,12 @@ VALUES ('1053', '1021', 'Profile Anomaly' , 'Unexpected US States', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), ('1054', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY 
"{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), ('1055', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), - ('1056', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a''),''[A-Z]'', ''A''),''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC;' ), + ('1056', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;'), ('1057', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'' AND "{COLUMN_NAME}" !~ ''\\s(and|but|or|yet)\\s'' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;' ), ('1058', '1001', 'Profile Anomaly' , 'Suggested_Type', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), ('1059', '1002', 'Profile Anomaly' , 'Non_Standard_Blanks', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" 
IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1050', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (LENGTH("{COLUMN_NAME}") >= 1 AND LENGTH("{COLUMN_NAME}") <= 4) OR (LENGTH("{COLUMN_NAME}") > 10) OR (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'', ''error'', ''missing'', ''tbd'', ''n/a'', ''#na'', ''none'', ''null'' , ''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'', ''(error)'', ''(missing)'', ''(tbd)'', ''(n/a)'', ''(#na)'', ''(none)'', ''(null)'', ''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'', ''[error]'', ''[missing]'', ''[tbd]'', ''[n/a]'', ''[#na]'', ''[none]'', ''[null]'' , ''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; '), + ('1050', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'), ('1061', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'postgresql', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY data_type, table_name;'), ('1062', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'postgresql', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON 
columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY data_type, table_name;'), ('1063', '1006', 'Profile Anomaly' , 'No_Values', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), @@ -483,7 +483,7 @@ VALUES ('1078', '1021', 'Profile Anomaly' , 'Unexpected US States', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), ('1079', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), ('1080', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;' ), - ('1081', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", ''[a-z]'', ''a'', ''g''), ''[A-Z]'', ''A'', ''g''), ''[0-9]'', ''N'', ''g'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC;' ), + ('1081', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'postgresql', NULL, 
'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;'), ('1082', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'' AND "{COLUMN_NAME}" !~ ''\s(and|but|or|yet)\s'' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;' ), @@ -522,12 +522,12 @@ VALUES ('1115', '1001', 'Profile Anomaly' , 'Suggested_Type', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), ('1116', '1002', 'Profile Anomaly' , 'Non_Standard_Blanks', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'') OR "{COLUMN_NAME}" LIKE '' '' THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE ''%..%'' OR LOWER("{COLUMN_NAME}") LIKE ''%--%'' OR (LEN(REPLACE("{COLUMN_NAME}", ''0'', ''''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", ''9'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''x'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''z'', ''''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1117', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (LEN("{COLUMN_NAME}") >= 1 AND LEN("{COLUMN_NAME}") <= 4) OR (LEN("{COLUMN_NAME}") > 10) OR (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE ''%..%'' OR LOWER("{COLUMN_NAME}") LIKE ''%--%'' OR (LEN(REPLACE("{COLUMN_NAME}", ''0'', ''''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", ''9'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''x'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''z'', ''''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), + ('1117', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), ('1118', '1004', 'Profile Anomaly' , 
'Multiple_Types_Minor', 'mssql', NULL, 'SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = ''datetime'' THEN ''datetime'' WHEN data_type = ''datetime2'' THEN ''datetime'' WHEN data_type = ''varchar'' THEN ''varchar('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''char'' THEN ''char('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''numeric'' THEN ''numeric('' + CAST(numeric_precision AS VARCHAR) + '','' + CAST(numeric_scale AS VARCHAR) + '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'), ('1119', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'mssql', NULL, 'SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = ''datetime'' THEN ''datetime'' WHEN data_type = ''datetime2'' THEN ''datetime'' WHEN data_type = ''varchar'' THEN ''varchar('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''char'' THEN ''char('' + CAST(character_maximum_length AS VARCHAR) + '')'' WHEN data_type = ''numeric'' THEN ''numeric('' + CAST(numeric_precision AS VARCHAR) + '','' + CAST(numeric_scale AS VARCHAR) + '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'), ('1120', '1006', 'Profile Anomaly' , 'No_Values', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1121', '1007', 'Profile Anomaly' , 'Column_Pattern_Mismatch', 'mssql', NULL, 'WITH cte AS ( SELECT TRIM(value) AS top_pattern, ROW_NUMBER() OVER (ORDER BY CHARINDEX(''| ''+ TRIM(value) + '' |'', ''| '' + ''{DETAIL_EXPRESSION}'' + '' |'' ) ASC) as row_num FROM STRING_SPLIT(''{DETAIL_EXPRESSION}'', ''|'') ) SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 4 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 6 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 8 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS 
count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 10 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" ORDER BY top_pattern DESC, count DESC;' ), - ('1122', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'mssql', NULL, 'SELECT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY table_name;' ), + ('1122', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'mssql', NULL, 'SELECT TOP 500 column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY table_name;' ), ('1123', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1124', '1010', 'Profile Anomaly' , 'Quoted_Values', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" LIKE ''"%"'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1125', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), @@ -543,7 +543,7 @@ VALUES ('1135', '1021', 'Profile Anomaly' , 'Unexpected US States', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ), ('1136', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ), ('1137', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), 
- ('1138', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'mssql', NULL, 'WITH cte AS ( SELECT TRIM(value) AS top_pattern, ROW_NUMBER() OVER (ORDER BY CHARINDEX(''| ''+ TRIM(value) + '' |'', ''| '' + ''{DETAIL_EXPRESSION}'' + '' |'' ) ASC) as row_num FROM STRING_SPLIT(''{DETAIL_EXPRESSION}'', ''|'') ) SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 4 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 6 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 8 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte c WHERE c.row_num = 10 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, ''abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'', ''aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN'') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" ORDER BY top_pattern DESC, count DESC;' ), + ('1138', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}";'), ('1139', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE ( "{COLUMN_NAME}" LIKE ''%,%,%,%'' OR "{COLUMN_NAME}" LIKE ''%|%|%|%'' OR "{COLUMN_NAME}" LIKE ''%^%^%^%'' OR "{COLUMN_NAME}" LIKE ''%'' + CHAR(9) + ''%'' + CHAR(9) + ''%'' + CHAR(9) + ''%'' ) AND NOT ( "{COLUMN_NAME}" LIKE ''% and %'' OR "{COLUMN_NAME}" LIKE ''% but %'' OR "{COLUMN_NAME}" LIKE ''% or %'' OR "{COLUMN_NAME}" LIKE ''% yet %'' ) AND ISNULL(CAST(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", '','', '''')) as FLOAT) / CAST(NULLIF(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", '' '', '''')), 0) as FLOAT), 1) > 0.6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), ('1140', '1004', 'Test Results', 'Alpha_Trunc', 'mssql', NULL, 'SELECT DISTINCT TOP 500 "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} ;'), @@ -701,7 +701,7 @@ ORDER BY check_period DESC;'), ('1172', '1001', 'Profile Anomaly' , 'Suggested_Type', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", 
COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), ('1173', '1002', 'Profile Anomaly' , 'Non_Standard_Blanks', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''-{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''0{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''9{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''x{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''z{2,}'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1174', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (LEN("{COLUMN_NAME}") >= 1 AND LEN("{COLUMN_NAME}") <= 4) OR (LEN("{COLUMN_NAME}") > 10) OR (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''-{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''0{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''9{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''x{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''z{2,}'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'), + ('1174', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'), ('1175', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'snowflake', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE ''timestamp%'' THEN lower(data_type) WHEN data_type ILIKE ''date'' THEN lower(data_type) WHEN data_type ILIKE ''boolean'' THEN ''boolean'' WHEN data_type = ''TEXT'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type ILIKE ''char%'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''NUMBER'' AND numeric_precision = 38 AND numeric_scale = 0 THEN ''bigint'' WHEN data_type ILIKE ''num%'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = 
''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'), ('1176', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'snowflake', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE ''timestamp%'' THEN lower(data_type) WHEN data_type ILIKE ''date'' THEN lower(data_type) WHEN data_type ILIKE ''boolean'' THEN ''boolean'' WHEN data_type = ''TEXT'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type ILIKE ''char%'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''NUMBER'' AND numeric_precision = 38 AND numeric_scale = 0 THEN ''bigint'' WHEN data_type ILIKE ''num%'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'), ('1177', '1006', 'Profile Anomaly' , 'No_Values', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), @@ -722,7 +722,7 @@ ORDER BY check_period DESC;'), ('1192', '1021', 'Profile Anomaly' , 'Unexpected US States', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), ('1193', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), ('1194', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ), - ('1195', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) B UNION ALL SELECT C.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, 
"{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) C UNION ALL SELECT D.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT trim(split_part(''{DETAIL_EXPRESSION}'', ''|'', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, ''[a-z]'', ''a''), ''[A-Z]'', ''A''), ''[0-9]'', ''N'') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) D ORDER BY top_pattern DESC, count DESC;' ), + ('1195', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;'), ('1196', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'') AND NOT REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, ''.*\\s(and|but|or|yet)\\s.*'') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;' ), ('1197', '1004', 'Test Results', 'Alpha_Trunc', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}" , LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;'), @@ -764,17 +764,18 @@ ORDER BY check_period DESC;'), ('1232', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER("{COLUMN_NAME}") = ANY(STRING_TO_ARRAY(SUBSTRING(''{DETAIL_EXPRESSION}'', STRPOS(''{DETAIL_EXPRESSION}'', '':'') + 2), ''|'')) GROUP BY "{COLUMN_NAME}";'), ('1233', '1043', 'Test Results', 'Valid_Characters', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ ''[[:cntrl:]]'' OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR "{COLUMN_NAME}" LIKE ''"%"'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1234', '1043', 'Test Results', 'Valid_Characters', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ ''[[:cntrl:]]'' OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR column_name LIKE ''"%"'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), + ('1234', '1043', 'Test Results', 'Valid_Characters', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ ''[[:cntrl:]]'' OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR "{COLUMN_NAME}" LIKE ''"%"'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), ('1235', '1043', 'Test Results', 'Valid_Characters', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM 
{TARGET_SCHEMA}.{TABLE_NAME} WHERE PATINDEX(''%['' + CHAR(1) + ''-'' + CHAR(8) + CHAR(11) + CHAR(12) + CHAR(14) + ''-'' + CHAR(31) + '']%'', "{COLUMN_NAME}") > 0 OR "{COLUMN_NAME}" LIKE '' %'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' OR column_name LIKE ''"%"'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), ('1236', '1043', 'Test Results', 'Valid_Characters', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}", ''.*[[:cntrl:]].*'') OR "{COLUMN_NAME}"::VARCHAR LIKE '' %'' OR "{COLUMN_NAME}"::VARCHAR LIKE ''''''%'''''' OR "{COLUMN_NAME}"::VARCHAR LIKE ''"%"'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1237', '1044', 'Test Results', 'Valid_US_Zip', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" NOT SIMILAR TO ''([0-9]{5} |[0-9]{5}-[0-9]{4}|[0-9]{9})'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1238', '1044', 'Test Results', 'Valid_US_Zip', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" NOT SIMILAR TO ''([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), - ('1239', '1044', 'Test Results', 'Valid_US_Zip', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" LIKE ''[0-9][0-9][0-9][0-9][0-9]'' OR "{COLUMN_NAME}" LIKE ''[0-9][0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'' OR "{COLUMN_NAME}" LIKE ''[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1240', '1044', 'Test Results', 'Valid_US_Zip', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, ''^([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})$'') = False GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1241', '1045', 'Test Results', 'Valid_US_Zip3', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LENGTH("{COLUMN_NAME}") <> 3 OR TRANSLATE("{COLUMN_NAME}",''0123456789'','''') <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1242', '1045', 'Test Results', 'Valid_US_Zip3', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LENGTH("{COLUMN_NAME}") <> 3 OR TRANSLATE("{COLUMN_NAME}",''0123456789'','''') <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), - ('1243', '1045', 'Test Results', 'Valid_US_Zip3', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LENGTH("{COLUMN_NAME}") <> 3 OR TRANSLATE("{COLUMN_NAME}",''0123456789'','''') <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1244', '1045', 'Test Results', 'Valid_US_Zip3', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LENGTH("{COLUMN_NAME}") <> 3 OR TRANSLATE("{COLUMN_NAME}",''0123456789'','''') <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'); + ('1237', '1044', 'Test Results', 'Valid_US_Zip', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), + 
('1238', '1044', 'Test Results', 'Valid_US_Zip', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), + ('1239', '1044', 'Test Results', 'Valid_US_Zip', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), + ('1240', '1044', 'Test Results', 'Valid_US_Zip', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), + + ('1241', '1045', 'Test Results', 'Valid_US_Zip3', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), + ('1242', '1045', 'Test Results', 'Valid_US_Zip3', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), + ('1243', '1045', 'Test Results', 'Valid_US_Zip3', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), + ('1244', '1045', 'Test Results', 'Valid_US_Zip3', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'); TRUNCATE TABLE variant_codings; diff --git a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml index ac0a170..d3e80e9 100644 --- a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml +++ b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml @@ -70,9 +70,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE("{COL_NAME}",' '''',.-', THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'PHONE_USA' WHEN CAST(SUM( CASE WHEN "{COL_NAME}" LIKE '%[_a-zA-Z0-9.-]%@%[a-zA-Z0-9.-]%.[a-zA-Z][a-zA-Z]%' THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'EMAIL' - WHEN CAST(SUM( CASE WHEN "{COL_NAME}" LIKE '[0-9][0-9][0-9][0-9][0-9]' - OR "{COL_NAME}" LIKE '[0-9][0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]' - OR "{COL_NAME}" LIKE '[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]' + WHEN CAST(SUM( CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'ZIP_USA' WHEN CAST(SUM( CASE WHEN "{COL_NAME}" COLLATE SQL_Latin1_General_CP1_CI_AS NOT LIKE ' %' AND "{COL_NAME}" COLLATE SQL_Latin1_General_CP1_CI_AS LIKE '[a-z0-9 _-]%' diff --git 
a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml index edad7c7..873ec6e 100644 --- a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml +++ b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml @@ -63,7 +63,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'PHONE_USA' WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'EMAIL' - WHEN SUM( CASE WHEN "{COL_NAME}" SIMILAR TO '([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})' + WHEN SUM( CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'ZIP_USA' WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[\w\s\-]+(? 0.9 THEN 'FILE_NAME' diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml index d7f0fee..308e605 100644 --- a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml +++ b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml @@ -63,7 +63,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'PHONE_USA' WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'EMAIL' - WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})$' + WHEN SUM( CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'ZIP_USA' WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[\\w\\s\-]+\\.(txt|csv|tsv|dat|doc|pdf|xlsx)$' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'FILE_NAME' diff --git a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml index ca41312..011c80b 100644 --- a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml +++ b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml @@ -64,7 +64,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'PHONE_USA' WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'EMAIL' - WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})$') + WHEN CAST(SUM( CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'ZIP_USA' WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[\\w\\s\-]+\\.(txt|csv|tsv|dat|doc|pdf|xlsx)$') THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'FILE_NAME' diff --git a/testgen/template/flavors/snowflake/setup_profiling_tools/create_functions_snowflake.sql 
b/testgen/template/flavors/snowflake/setup_profiling_tools/create_functions_snowflake.sql index ae26031..f271a24 100644 --- a/testgen/template/flavors/snowflake/setup_profiling_tools/create_functions_snowflake.sql +++ b/testgen/template/flavors/snowflake/setup_profiling_tools/create_functions_snowflake.sql @@ -31,7 +31,7 @@ SELECT CASE WHEN TRY_TO_DATE(strparm, 'YYYYMMDDHHMISS') IS NOT NULL THEN 1 -- YYYYMMDD - WHEN TRY_TO_DATE(strparm, 'YYYYMMDD') IS NOT NULL THEN 1 + WHEN LENGTH(strparm) = 8 AND TRY_TO_DATE(strparm, 'YYYYMMDD') IS NOT NULL THEN 1 -- YYYY-MON-DD HH:MM:SS SSSSSS --WHEN TRY_TO_DATE(strparm, 'YYYY-MON-DD HH:MI:SS SSSSSS') IS NOT NULL THEN 1 diff --git a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml index 7967003..e9006c9 100644 --- a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml +++ b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml @@ -63,7 +63,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a THEN 1 END) AS REAL)/CAST(COUNT("{COL_NAME}") AS REAL) > 0.9 THEN 'PHONE_USA' WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}" , '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') = TRUE THEN 1 END) AS REAL)/CAST(COUNT("{COL_NAME}") AS REAL) > 0.9 THEN 'EMAIL' - WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}" , '^([0-9]{5}|[0-9]{5}-[0-9]{4}|[0-9]{9})$') = TRUE + WHEN CAST(SUM( CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') THEN 1 END) AS REAL)/CAST(COUNT("{COL_NAME}") AS REAL) > 0.9 THEN 'ZIP_USA' WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}" , '^[\\w\\s\-]+\\.(txt|csv|tsv|dat|doc|pdf|xlsx)$') = TRUE THEN 1 END) AS REAL)/CAST(COUNT("{COL_NAME}") AS REAL) > 0.9 THEN 'FILE_NAME' From 6ace65270cefc91de898994be70105baf4996aab Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Mon, 13 May 2024 08:55:29 -0400 Subject: [PATCH 17/19] fix: several pre-release bug fixes --- .../commands/queries/generate_tests_query.py | 6 ++ testgen/commands/run_generate_tests.py | 11 +-- .../commands/run_test_parameter_validation.py | 68 ++++++++++--------- .../gen_funny_cat_tests/gen_test_constant.sql | 15 ++-- .../gen_test_distinct_value_ct.sql | 12 ++-- .../gen_funny_cat_tests/gen_test_row_ct.sql | 15 ++-- .../gen_test_row_ct_pct.sql | 15 ++-- .../generation/gen_delete_old_tests.sql | 10 +++ .../generation/gen_standard_tests.sql | 10 +-- .../refresh_data_chars_from_profiling.sql | 10 ++- testgen/ui/queries/test_suite_queries.py | 11 +-- testgen/ui/services/form_service.py | 1 + testgen/ui/views/test_definitions.py | 5 ++ testgen/ui/views/test_suites.py | 16 ++--- 14 files changed, 110 insertions(+), 95 deletions(-) create mode 100644 testgen/template/generation/gen_delete_old_tests.sql diff --git a/testgen/commands/queries/generate_tests_query.py b/testgen/commands/queries/generate_tests_query.py index a26fe28..3828afc 100644 --- a/testgen/commands/queries/generate_tests_query.py +++ b/testgen/commands/queries/generate_tests_query.py @@ -84,3 +84,9 @@ def GetTestQueriesFromGenericFile(self, booClean: bool): if booClean: strQuery = CleanSQL(strQuery) return strQuery + + def GetDeleteOldTestsQuery(self, booClean: bool): + strQuery = self.ReplaceParms(read_template_sql_file("gen_delete_old_tests.sql", "generation")) + if booClean: + strQuery = CleanSQL(strQuery) + return strQuery diff --git a/testgen/commands/run_generate_tests.py 
b/testgen/commands/run_generate_tests.py index c4f2234..d9d31d0 100644 --- a/testgen/commands/run_generate_tests.py +++ b/testgen/commands/run_generate_tests.py @@ -56,9 +56,12 @@ def run_test_gen_queries(strTableGroupsID, strTestSuite, strGenerationSet=None): LOG.info("CurrentStep: Compiling Test Gen Queries") - lstCustomTemplateQueries = clsTests.GetTestDerivationQueriesAsList(booClean) + lstFunnyTemplateQueries = clsTests.GetTestDerivationQueriesAsList(booClean) lstGenericTemplateQueries = [] + # Delete old Tests + strDeleteQuery = clsTests.GetDeleteOldTestsQuery(booClean) + # Retrieve test_types as parms from list of dictionaries: test_type, selection_criteria, default_parm_columns, # default_parm_values strQuery = clsTests.GetTestTypesSQL(booClean) @@ -87,10 +90,10 @@ def run_test_gen_queries(strTableGroupsID, strTestSuite, strGenerationSet=None): if strQuery: lstGenericTemplateQueries.append(strQuery) - LOG.info("Test Gen Queries were compiled") + LOG.info("TestGen CAT Queries were compiled") - # Make sure generic test gen runs before the template gen - lstQueries = lstGenericTemplateQueries + lstCustomTemplateQueries + # Make sure delete, then generic templates run before the funny templates + lstQueries = [strDeleteQuery, *lstGenericTemplateQueries, *lstFunnyTemplateQueries] if lstQueries: LOG.info("Running Test Generation Template Queries") diff --git a/testgen/commands/run_test_parameter_validation.py b/testgen/commands/run_test_parameter_validation.py index 51af78f..efa2ace 100644 --- a/testgen/commands/run_test_parameter_validation.py +++ b/testgen/commands/run_test_parameter_validation.py @@ -45,38 +45,40 @@ def run_parameter_validation_queries( lstTestColumns = RetrieveDBResultsToDictList("DKTG", strColumnList) if len(lstTestColumns) == 0: - LOG.warning("Test Column list is empty") - # Derive test schema list -- make CSV string from list of columns - # to be used as criteria for retrieving data dictionary - setSchemas = {s["columns"].split(".")[0] for s in lstTestColumns} - strSchemas = ", ".join([f"'{value}'" for value in setSchemas]) - LOG.debug("Test column list successfully retrieved") - - # Retrieve Project Column list - LOG.info("CurrentStep: Retrieve Test Columns for Validation") - clsExecute.test_schemas = strSchemas - strProjectColumnList = clsExecute.GetProjectTestValidationColumns() - if "where table_schema in ()" in strProjectColumnList: - raise ValueError("No schema specified in Validation Columns check") - lstProjectTestColumns = RetrieveDBResultsToDictList("PROJECT", strProjectColumnList) - - if len(lstProjectTestColumns) == 0: - LOG.info("Project Test Column list is empty") - - LOG.debug("Project column list successfully received") - LOG.info("CurrentStep: Compare column sets") - # load results into sets - result_set1 = {item["columns"].lower() for item in set(lstTestColumns)} - result_set2 = {item["columns"].lower() for item in set(lstProjectTestColumns)} - - # Check if all columns exist in the table - missing_columns = result_set1.difference(result_set2) - - if len(missing_columns) == 0: - LOG.info("No missing column in Project Column list.") - - strMissingColumns = ", ".join(f"'{x}'" for x in missing_columns) - srtNoQuoteMissingCols = strMissingColumns.replace("'", "") + LOG.warning(f"No test columns are present to validate in Test Suite {strTestSuite}") + missing_columns = [] + else: + # Derive test schema list -- make CSV string from list of columns + # to be used as criteria for retrieving data dictionary + setSchemas = {s["columns"].split(".")[0] for s 
in lstTestColumns} + strSchemas = ", ".join([f"'{value}'" for value in setSchemas]) + LOG.debug("Test column list successfully retrieved") + + # Retrieve Project Column list + LOG.info("CurrentStep: Retrieve Test Columns for Validation") + clsExecute.test_schemas = strSchemas + strProjectColumnList = clsExecute.GetProjectTestValidationColumns() + if "where table_schema in ()" in strProjectColumnList: + raise ValueError("No schema specified in Validation Columns check") + lstProjectTestColumns = RetrieveDBResultsToDictList("PROJECT", strProjectColumnList) + + if len(lstProjectTestColumns) == 0: + LOG.info("Project Test Column list is empty") + + LOG.debug("Project column list successfully received") + LOG.info("CurrentStep: Compare column sets") + # load results into sets + result_set1 = {item["columns"].lower() for item in set(lstTestColumns)} + result_set2 = {item["columns"].lower() for item in set(lstProjectTestColumns)} + + # Check if all columns exist in the table + missing_columns = result_set1.difference(result_set2) + + if len(missing_columns) == 0: + LOG.info("No missing column in Project Column list.") + + strMissingColumns = ", ".join(f"'{x}'" for x in missing_columns) + srtNoQuoteMissingCols = strMissingColumns.replace("'", "") if missing_columns: LOG.debug("Test Columns are missing in target database: %s", srtNoQuoteMissingCols) @@ -127,4 +129,4 @@ def run_parameter_validation_queries( LOG.info("Validation Complete: tests referencing missing columns have been disabled.") else: - LOG.info("Validation Successful: All columns exist in target database.") + LOG.info("Validation Successful: No columns missing from target database.") diff --git a/testgen/template/gen_funny_cat_tests/gen_test_constant.sql b/testgen/template/gen_funny_cat_tests/gen_test_constant.sql index de1d080..04434ac 100644 --- a/testgen/template/gen_funny_cat_tests/gen_test_constant.sql +++ b/testgen/template/gen_funny_cat_tests/gen_test_constant.sql @@ -1,11 +1,3 @@ --- First delete old tests that are not locked -DELETE FROM test_definitions -WHERE project_code = '{PROJECT_CODE}' - AND table_groups_id = '{TABLE_GROUPS_ID}'::UUID - AND test_suite = '{TEST_SUITE}' - AND test_type = 'Constant' - AND COALESCE(lock_refresh, 'N') <> 'Y'; - -- Then insert new tests where a locked test is not already present INSERT INTO test_definitions (project_code, table_groups_id, profile_run_id, test_type, test_suite, @@ -91,7 +83,12 @@ newtests AS ( SELECT 'Constant'::VARCHAR AS test_type, INNER JOIN rightcols r ON (c.schema_name = r.schema_name AND c.table_name = r.table_name - AND c.column_name = r.column_name) ) + AND c.column_name = r.column_name) + LEFT JOIN generation_sets s + ON ('Constant' = s.test_type + AND '{GENERATION_SET}' = s.generation_set) + WHERE (s.generation_set IS NOT NULL + OR '{GENERATION_SET}' = '') ) SELECT n.project_code, '{TABLE_GROUPS_ID}'::UUID as table_groups_id, n.profile_run_id, n.test_type, n.test_suite, n.schema_name, n.table_name, n.column_name, 0 as skip_errors, '{RUN_DATE}'::TIMESTAMP as auto_gen_date, diff --git a/testgen/template/gen_funny_cat_tests/gen_test_distinct_value_ct.sql b/testgen/template/gen_funny_cat_tests/gen_test_distinct_value_ct.sql index ecab786..bab7bfd 100644 --- a/testgen/template/gen_funny_cat_tests/gen_test_distinct_value_ct.sql +++ b/testgen/template/gen_funny_cat_tests/gen_test_distinct_value_ct.sql @@ -1,6 +1,5 @@ --- FIRST VERSION AND DELETE PART HANDLED IN SEPARATE SQL FILE gen_standard_tests.sql using generic parameters - --- Second version: constants with changing 
values (1 distinct value) +-- FIRST TYPE OF CONSTANT IS HANDLED IN SEPARATE SQL FILE gen_standard_tests.sql using generic parameters +-- Second type: constants with changing values (1 distinct value) INSERT INTO test_definitions (project_code, table_groups_id, profile_run_id, test_type, test_suite, schema_name, table_name, column_name, skip_errors, last_auto_gen_date, test_active, @@ -77,7 +76,12 @@ newtests AS ( SELECT 'Distinct_Value_Ct'::VARCHAR AS test_type, INNER JOIN rightcols r ON (c.schema_name = r.schema_name AND c.table_name = r.table_name - AND c.column_name = r.column_name) ) + AND c.column_name = r.column_name) + LEFT JOIN generation_sets s + ON ('Distinct_Value_Ct' = s.test_type + AND '{GENERATION_SET}' = s.generation_set) + WHERE (s.generation_set IS NOT NULL + OR '{GENERATION_SET}' = '') ) SELECT n.project_code, n.table_groups_id, n.profile_run_id, n.test_type, n.test_suite, n.schema_name, n.table_name, n.column_name, 0 as skip_errors, diff --git a/testgen/template/gen_funny_cat_tests/gen_test_row_ct.sql b/testgen/template/gen_funny_cat_tests/gen_test_row_ct.sql index bdaf3e0..e5b9c9a 100644 --- a/testgen/template/gen_funny_cat_tests/gen_test_row_ct.sql +++ b/testgen/template/gen_funny_cat_tests/gen_test_row_ct.sql @@ -1,12 +1,4 @@ --- First delete old tests that are not locked -DELETE FROM test_definitions - WHERE project_code = '{PROJECT_CODE}' - AND table_groups_id = '{TABLE_GROUPS_ID}'::UUID - AND test_suite = '{TEST_SUITE}' - AND COALESCE(lock_refresh, 'N') <> 'Y' - AND test_type='Row_Ct'; - --- Then insert new tests where a locked test is not already present +-- Insert new tests where a locked test is not already present INSERT INTO test_definitions (project_code, table_groups_id, profile_run_id, test_type, test_suite, schema_name, table_name, skip_errors, threshold_value, @@ -40,8 +32,13 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date table_name, MAX(record_ct) as record_ct FROM curprof c + LEFT JOIN generation_sets s + ON ('Row_Ct' = s.test_type + AND '{GENERATION_SET}' = s.generation_set) WHERE schema_name = '{DATA_SCHEMA}' AND functional_table_type LIKE '%cumulative%' + AND (s.generation_set IS NOT NULL + OR '{GENERATION_SET}' = '') GROUP BY project_code, table_groups_id, profile_run_id, test_type, test_suite, schema_name, table_name ) SELECT n.project_code, n.table_groups_id, n.profile_run_id, diff --git a/testgen/template/gen_funny_cat_tests/gen_test_row_ct_pct.sql b/testgen/template/gen_funny_cat_tests/gen_test_row_ct_pct.sql index 67ccff0..a18f15f 100644 --- a/testgen/template/gen_funny_cat_tests/gen_test_row_ct_pct.sql +++ b/testgen/template/gen_funny_cat_tests/gen_test_row_ct_pct.sql @@ -1,12 +1,4 @@ --- First delete old tests that are not locked -DELETE FROM test_definitions - WHERE project_code = '{PROJECT_CODE}' - AND table_groups_id = '{TABLE_GROUPS_ID}'::UUID - AND test_suite = '{TEST_SUITE}' - AND COALESCE(lock_refresh, 'N') <> 'Y' - AND test_type='Row_Ct_Pct'; - --- Then insert new tests where a locked test is not already present +-- Insert new tests where a locked test is not already present INSERT INTO test_definitions (project_code, table_groups_id, profile_run_id, test_type, test_suite, schema_name, table_name, skip_errors, last_auto_gen_date, profiling_as_of_date, test_active, @@ -41,8 +33,13 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date table_name, MAX(record_ct) as record_ct FROM curprof + LEFT JOIN generation_sets s + ON ('Row_Ct_Pct' = s.test_type + AND '{GENERATION_SET}' = 
s.generation_set) WHERE schema_name = '{DATA_SCHEMA}' AND functional_table_type NOT ILIKE '%cumulative%' + AND (s.generation_set IS NOT NULL + OR '{GENERATION_SET}' = '') GROUP BY project_code, table_groups_id, profile_run_id, test_type, test_suite, schema_name, table_name HAVING MAX(record_ct) >= 500) diff --git a/testgen/template/generation/gen_delete_old_tests.sql b/testgen/template/generation/gen_delete_old_tests.sql new file mode 100644 index 0000000..da4f55e --- /dev/null +++ b/testgen/template/generation/gen_delete_old_tests.sql @@ -0,0 +1,10 @@ +DELETE FROM test_definitions + WHERE id IN ( + SELECT d.id + FROM test_definitions d + INNER JOIN test_types t + ON (d.test_type = t.test_type + AND 'CAT' = t.run_type) + WHERE d.table_groups_id = '{TABLE_GROUPS_ID}'::UUID + AND d.test_suite = '{TEST_SUITE}' + AND COALESCE(d.lock_refresh, 'N') <> 'Y' ); diff --git a/testgen/template/generation/gen_standard_tests.sql b/testgen/template/generation/gen_standard_tests.sql index 73ab68c..b916420 100644 --- a/testgen/template/generation/gen_standard_tests.sql +++ b/testgen/template/generation/gen_standard_tests.sql @@ -1,12 +1,4 @@ --- First delete old tests that are not locked -DELETE FROM test_definitions -WHERE project_code = '{PROJECT_CODE}' - AND table_groups_id = '{TABLE_GROUPS_ID}'::UUID - AND test_suite = '{TEST_SUITE}' - AND test_type = '{TEST_TYPE}' - AND COALESCE(lock_refresh, 'N') <> 'Y'; - --- Then insert new tests where a locked test is not already present +-- Insert new tests where a locked test is not already present INSERT INTO test_definitions (project_code, table_groups_id, profile_run_id, test_type, test_suite, schema_name, table_name, column_name, skip_errors, test_active, last_auto_gen_date, profiling_as_of_date, diff --git a/testgen/template/profiling/refresh_data_chars_from_profiling.sql b/testgen/template/profiling/refresh_data_chars_from_profiling.sql index 3a6cb2a..744a5ad 100644 --- a/testgen/template/profiling/refresh_data_chars_from_profiling.sql +++ b/testgen/template/profiling/refresh_data_chars_from_profiling.sql @@ -74,7 +74,8 @@ LEFT JOIN new_chars n ON (d.table_groups_id = n.table_groups_id AND d.schema_name = n.schema_name AND d.table_name = n.table_name) - WHERE n.table_name IS NULL; + WHERE data_table_chars.table_id = d.table_id + AND n.table_name IS NULL; -- ============================================================================== -- | Column Characteristics @@ -100,7 +101,8 @@ INNER JOIN data_column_chars d AND n.schema_name = d.schema_name AND n.table_name = d.table_name AND n.column_name = d.column_name) - WHERE data_column_chars.table_id = d.table_id; + WHERE data_column_chars.table_id = d.table_id + AND data_column_chars.column_name = d.column_name; -- Add new records WITH new_chars @@ -149,4 +151,6 @@ LEFT JOIN new_chars n AND d.schema_name = n.schema_name AND d.table_name = n.table_name AND d.column_name = n.column_name) - WHERE n.column_name IS NULL; + WHERE data_column_chars.table_id = d.table_id + AND data_column_chars.column_name = d.column_name + AND n.column_name IS NULL; diff --git a/testgen/ui/queries/test_suite_queries.py b/testgen/ui/queries/test_suite_queries.py index b57e384..73f08a3 100644 --- a/testgen/ui/queries/test_suite_queries.py +++ b/testgen/ui/queries/test_suite_queries.py @@ -111,10 +111,13 @@ def get_test_suite_usage(schema: str, test_suite_names: list[str]) -> pd.DataFra def get_test_suite_refresh_check(schema, test_suite_name): sql = f""" SELECT COUNT(*) as test_ct, - SUM(CASE WHEN lock_refresh = 'N' THEN 1 ELSE 0 END) 
as unlocked_test_ct, - SUM(CASE WHEN lock_refresh = 'N' AND last_manual_update IS NOT NULL THEN 1 ELSE 0 END) as unlocked_edits_ct - FROM {schema}.test_definitions - WHERE test_suite = '{test_suite_name}'; + SUM(CASE WHEN COALESCE(d.lock_refresh, 'N') = 'N' THEN 1 ELSE 0 END) as unlocked_test_ct, + SUM(CASE WHEN COALESCE(d.lock_refresh, 'N') = 'N' AND d.last_manual_update IS NOT NULL THEN 1 ELSE 0 END) as unlocked_edits_ct + FROM {schema}.test_definitions d + INNER JOIN {schema}.test_types t + ON (d.test_type = t.test_type) + WHERE d.test_suite = '{test_suite_name}' + AND t.run_type = 'CAT'; """ return db.retrieve_data_list(sql)[0] diff --git a/testgen/ui/services/form_service.py b/testgen/ui/services/form_service.py index e57c846..f3c8606 100644 --- a/testgen/ui/services/form_service.py +++ b/testgen/ui/services/form_service.py @@ -176,6 +176,7 @@ def render_widget(self, boo_form_display_only=False): raise ValueError(f"Widget {self.widget} is not supported.") +@st.cache_data(show_spinner=False) def _generate_excel_export( df_data, lst_export_columns, str_title=None, str_caption=None, lst_wrap_columns=None, lst_column_headers=None ): diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index c78150a..e1e1c95 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -861,26 +861,31 @@ def run_test_type_lookup_query(str_test_type=None): return dq.run_test_type_lookup_query(str_schema, str_test_type) +@st.cache_data(show_spinner=False) def run_connections_lookup_query(str_project_code): str_schema = st.session_state["dbschema"] return dq.run_connections_lookup_query(str_schema, str_project_code) +@st.cache_data(show_spinner=False) def run_table_groups_lookup_query(str_project_code, str_connection_id=None, table_group_id=None): str_schema = st.session_state["dbschema"] return dq.run_table_groups_lookup_query(str_schema, str_project_code, str_connection_id, table_group_id) +@st.cache_data(show_spinner=False) def run_table_lookup_query(str_table_groups_id): str_schema = st.session_state["dbschema"] return dq.run_table_lookup_query(str_schema, str_table_groups_id) +@st.cache_data(show_spinner=False) def run_column_lookup_query(str_table_groups_id, str_table_name): str_schema = st.session_state["dbschema"] return dq.run_column_lookup_query(str_schema, str_table_groups_id, str_table_name) +@st.cache_data(show_spinner=False) def run_test_suite_lookup_query(str_table_groups_id, test_suite_name=None): str_schema = st.session_state["dbschema"] return dq.run_test_suite_lookup_by_tgroup_query(str_schema, str_table_groups_id, test_suite_name) diff --git a/testgen/ui/views/test_suites.py b/testgen/ui/views/test_suites.py index 3df1558..ab78795 100644 --- a/testgen/ui/views/test_suites.py +++ b/testgen/ui/views/test_suites.py @@ -190,7 +190,7 @@ def show_record_detail( ) with right_column: - st.write("

", unsafe_allow_html=True) + # st.write("

", unsafe_allow_html=True) _, button_column = st.columns([0.2, 0.8]) with button_column: run_now_commands_tab, cli_commands_tab = st.tabs(["Test Suite Actions", "View CLI Commands"]) @@ -253,19 +253,13 @@ def show_run_test_generation(modal, selected): counts_msg = f"\n\nTests: {test_ct}, Unlocked: {unlocked_test_ct}, Edited Unlocked: {unlocked_edits_ct}" if unlocked_edits_ct > 0: if unlocked_edits_ct > 1: - warning_msg = ( - "Manual changes have been made to tests in this Test Suite that have not been locked. " - ) + + warning_msg = "Manual changes have been made to auto-generated tests in this Test Suite that have not been locked. " else: - warning_msg = ( - "A manual change has been made to a test in this Test Suite that has not been locked. " - ) + warning_msg = "A manual change has been made to an auto-generated test in this Test Suite that has not been locked. " elif unlocked_test_ct > 0: warning_msg = "Auto-generated tests are present in this Test Suite that have not been locked. " - warning_msg = ( - f"{warning_msg}Generating tests now will overwrite all unlocked tests currently in the " - f"test suite with new tests based on the latest profiling.{counts_msg}" - ) + warning_msg = f"{warning_msg}Generating tests now will overwrite unlocked tests that can be auto-generated with new tests based on the latest profiling.{counts_msg}" with warning_container: st.warning(warning_msg) if unlocked_edits_ct > 0: From 1ca2d8086ddb1035f577711534cb2cda15bb646f Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Mon, 13 May 2024 09:00:08 -0400 Subject: [PATCH 18/19] fix(test generation): avoid deleting manual cat tests Tests generation was deleting test types that cannot be auto re- generated. --- testgen/template/generation/gen_delete_old_tests.sql | 1 + testgen/ui/queries/test_suite_queries.py | 8 +++++--- testgen/ui/services/test_suite_service.py | 4 ++-- testgen/ui/views/test_suites.py | 4 ++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/testgen/template/generation/gen_delete_old_tests.sql b/testgen/template/generation/gen_delete_old_tests.sql index da4f55e..a0e9bdf 100644 --- a/testgen/template/generation/gen_delete_old_tests.sql +++ b/testgen/template/generation/gen_delete_old_tests.sql @@ -7,4 +7,5 @@ DELETE FROM test_definitions AND 'CAT' = t.run_type) WHERE d.table_groups_id = '{TABLE_GROUPS_ID}'::UUID AND d.test_suite = '{TEST_SUITE}' + AND t.selection_criteria IS NOT NULL AND COALESCE(d.lock_refresh, 'N') <> 'Y' ); diff --git a/testgen/ui/queries/test_suite_queries.py b/testgen/ui/queries/test_suite_queries.py index 73f08a3..8e45764 100644 --- a/testgen/ui/queries/test_suite_queries.py +++ b/testgen/ui/queries/test_suite_queries.py @@ -108,7 +108,7 @@ def get_test_suite_usage(schema: str, test_suite_names: list[str]) -> pd.DataFra return db.retrieve_data(sql) -def get_test_suite_refresh_check(schema, test_suite_name): +def get_test_suite_refresh_check(schema, table_groups_id, test_suite_name): sql = f""" SELECT COUNT(*) as test_ct, SUM(CASE WHEN COALESCE(d.lock_refresh, 'N') = 'N' THEN 1 ELSE 0 END) as unlocked_test_ct, @@ -116,8 +116,10 @@ def get_test_suite_refresh_check(schema, test_suite_name): FROM {schema}.test_definitions d INNER JOIN {schema}.test_types t ON (d.test_type = t.test_type) - WHERE d.test_suite = '{test_suite_name}' - AND t.run_type = 'CAT'; + WHERE d.table_groups_id = '{table_groups_id}'::UUID + AND d.test_suite = '{test_suite_name}' + AND t.run_type = 'CAT' + AND t.selection_criteria IS NOT NULL; """ return db.retrieve_data_list(sql)[0] diff --git 
a/testgen/ui/services/test_suite_service.py b/testgen/ui/services/test_suite_service.py index 45adc4e..a827657 100644 --- a/testgen/ui/services/test_suite_service.py +++ b/testgen/ui/services/test_suite_service.py @@ -44,11 +44,11 @@ def are_test_suites_in_use(test_suite_names): return not usage_result.empty -def get_test_suite_refresh_warning(test_suite_name): +def get_test_suite_refresh_warning(table_groups_id, test_suite_name): if not test_suite_name: return False schema = st.session_state["dbschema"] - row_result = test_suite_queries.get_test_suite_refresh_check(schema, test_suite_name) + row_result = test_suite_queries.get_test_suite_refresh_check(schema, table_groups_id, test_suite_name) test_ct = None unlocked_test_ct = None diff --git a/testgen/ui/views/test_suites.py b/testgen/ui/views/test_suites.py index ab78795..760ce4c 100644 --- a/testgen/ui/views/test_suites.py +++ b/testgen/ui/views/test_suites.py @@ -246,7 +246,7 @@ def show_run_test_generation(modal, selected): status_container = st.empty() test_ct, unlocked_test_ct, unlocked_edits_ct = test_suite_service.get_test_suite_refresh_warning( - selected_test_suite["test_suite"] + selected_test_suite["table_groups_id"], selected_test_suite["test_suite"] ) if test_ct: warning_msg = "" @@ -259,7 +259,7 @@ def show_run_test_generation(modal, selected): warning_msg = "A manual change has been made to an auto-generated test in this Test Suite that has not been locked. " elif unlocked_test_ct > 0: warning_msg = "Auto-generated tests are present in this Test Suite that have not been locked. " - warning_msg = f"{warning_msg}Generating tests now will overwrite unlocked tests that can be auto-generated with new tests based on the latest profiling.{counts_msg}" + warning_msg = f"{warning_msg}Generating tests now will overwrite unlocked tests subject to auto-generation based on the latest profiling.{counts_msg}" with warning_container: st.warning(warning_msg) if unlocked_edits_ct > 0: From 6f2fd59e24d48ff2549e5ad7cd5d6ac1af636306 Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Tue, 14 May 2024 12:51:25 -0400 Subject: [PATCH 19/19] release: 2.0.0 -> 2.1.1 --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 10a35ed..5954fcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "data-ops-testgen" -version = "2.0.0" +version = "2.1.1" description = "DataKitchen Inc. Data Quality Engine" urls = { "homepage" = "https://datakitchen.io" } authors = [ @@ -242,7 +242,7 @@ omit = ["tests/*", "templates/*"] skip_empty=true [tool.bumpver] -current_version = "2.0.0" +current_version = "2.1.1" version_pattern = "MAJOR.MINOR.PATCH[PYTAGNUM]" commit_message = "release: {old_version} -> {new_version}" commit = true
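
A note on the TRANSLATE-based zip checks used throughout the changes above: every digit 0-8 is mapped to '9' ('9' already maps to itself, which is why the source list is only '012345678'), so a well-formed US zip code normalizes to exactly one of three shapes: '99999', '999999999', or '99999-9999', and the three-digit Valid_US_Zip3 variant normalizes to '999'. Because the same three-argument TRANSLATE call appears in the redshift, snowflake, postgresql, mssql, and trino templates, one expression replaces the earlier per-flavor regex and LIKE patterns. A minimal sketch of the pattern, assuming a hypothetical some_schema.addresses table with a zip column (placeholders, not identifiers from this patch):

    -- Rows whose zip does not normalize to a valid US zip shape, most frequent first.
    SELECT zip,
           COUNT(*) AS record_ct
      FROM some_schema.addresses                -- hypothetical table and column
     WHERE TRANSLATE(zip, '012345678', '999999999')
           NOT IN ('99999', '999999999', '99999-9999')
     GROUP BY zip
     ORDER BY record_ct DESC;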