From 8d549ca55ab718df5d976018477b348d4a35574e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20=C5=81awski?= Date: Tue, 26 Sep 2023 16:00:17 +0000 Subject: [PATCH 1/4] Add Categories import to Business Glossary import functionality for quickstart-lab. --- .../python/business-glossary-import/README.md | 57 +- .../bg_import/business_glossary_import.py | 208 ++- .../bg_import/categories_csv_parser.py | 229 +++ .../bg_import/category.py | 92 ++ .../bg_import/entry_type.py | 10 + .../bg_import/error.py | 20 +- .../bg_import/glossary.py | 562 +++++-- .../bg_import/import_mode.py | 10 + .../bg_import/import_types.py | 21 + .../bg_import/parse_utils.py | 39 +- .../bg_import/parser_types.py | 16 + .../bg_import/relation_type.py | 12 + .../bg_import/term.py | 30 +- .../{csv_parser.py => terms_csv_parser.py} | 53 +- .../tests/categories_csv_parser_test.py | 156 ++ .../bg_import/tests/category_test.py | 83 + .../bg_import/tests/error_test.py | 22 +- .../bg_import/tests/glossary_test.py | 1470 ++++++++++++----- .../bg_import/tests/parse_utils_test.py | 131 +- .../bg_import/tests/term_test.py | 26 + ...arser_test.py => terms_csv_parser_test.py} | 52 +- .../bg_import/tests/test_utils/mocks.py | 591 +++++-- .../bg_import/user_report.py | 165 +- .../bg_import/utils.py | 85 +- 24 files changed, 3222 insertions(+), 918 deletions(-) create mode 100644 dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/categories_csv_parser.py create mode 100644 dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/category.py create mode 100644 dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/entry_type.py create mode 100644 dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/import_mode.py create mode 100644 dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/import_types.py create mode 100644 dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/parser_types.py create mode 100644 dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/relation_type.py rename dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/{csv_parser.py => terms_csv_parser.py} (82%) create mode 100644 dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/categories_csv_parser_test.py create mode 100644 dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/category_test.py rename dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/{csv_parser_test.py => terms_csv_parser_test.py} (70%) diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/README.md b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/README.md index 2a909df..6b760ca 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/README.md +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/README.md @@ -1,10 +1,11 @@ # Overview -`bg_import` is a utility that performs bulk import of terms into a Data Catalog business -glossary from a CSV file. To achieve that the CSV file is parsed and validated. -The resulting list of terms is then added into the target glossary via Data -Catalog API. 
If any errors occur at any stage of the process then an error
-report is printed and import continues or completely stops depending on input flags.
+`bg_import` is a utility that performs bulk import of categories and terms into
+a Data Catalog business glossary from CSV files. To achieve that, the CSV files
+(one for categories and one for terms) are parsed and validated. The resulting
+lists of categories and terms are then added into the target glossary via the
+Data Catalog API. If any errors occur at any stage of the process, an error
+report is printed and the import continues or stops completely depending on input flags.
 
 Business Glossary API is currently on private preview, and it needs to be
 enabled on the project for it to be used.
 
@@ -12,17 +13,23 @@ enabled on the project for it to be used.
 ## Usage
 
 ```
-python3 bg_import/business_glossary_import.py <csv file>
+python3 bg_import/business_glossary_import.py [<terms csv file legacy>]
   --project=<project id>
   --group=<entry group id>
   --glossary=<glossary id>
   --location=<location>
   [--import-mode={strict,clear}]
-  [--strict-parsing]
+  [--categories-csv=<categories csv file>]
+  [--terms-csv=<terms csv file>]
   [-h]
 ```
 
-Run `python3 bg_import/business_glossary_import.py -h` for description of individual arguments.
+Currently the `strict` and `clear` import modes are supported. The default
+mode is `strict`. \
+Provide the terms CSV file via the `--terms-csv` argument; the legacy
+positional `terms csv file` argument is deprecated. \
+Run `python3 bg_import/business_glossary_import.py -h` for a description of
+individual arguments.
 
 ### Access token
 
@@ -35,10 +42,32 @@ export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token)
 
 ## CSV file schema
 
-The source CSV file shall adhere to RFC4180 format. Each record in the file
-represents a single term with the following values:
+The source CSV files shall adhere to the RFC 4180 format.
 
-`term_display_name,description,steward,tagged_assets,synonyms,related_terms`
+### Categories CSV schema
+
+Each record in the categories CSV file represents a single category with the
+following values:
+
+`category_display_name,description,steward,belongs_to_category`
+
+Where:
+
+* `category_display_name` (required): Unique name for the entry category.
+* `description` (required): Plain text or rich text encoded as plain text
+  description for the category.
+* `steward` (optional): List of data stewards for the current category, with
+  each steward separated by a comma (`,`). E.g.:
+  `Data Steward1<steward1@email.com>, Data Steward2<steward2@email.com>`
+* `belongs_to_category` (optional): Display name of a category to which the
+  category belongs.
+
+### Terms CSV schema
+
+Each record in the terms CSV file represents a single term with the
+following values:
+
+`term_display_name,description,steward,tagged_assets,synonyms,related_terms,belongs_to_category`
 
 Where:
 
@@ -51,12 +80,14 @@ Where:
 * `tagged_assets` (optional): List of asset names for assets explained by the
   current term, with each asset separated by a comma (`,`). If a specific
   field of the asset needs to be explained by the current term, and not the
-  asset as a whole, the field can be indicated by separating it from the
-  asset name with a colon (:) eg. `asset_name:field`
+  asset as a whole, the field can be indicated by separating it from the asset
+  name with a colon (:), e.g. 
`asset_name:field` * `synonyms` (optional): List of terms that have a synonym relation with the current term, with each term separated by a comma (`,`) * `related_terms` (optional): List of terms that have a related-to relation with the current term, with each term separated by a comma (`,`) +* `belongs_to_category` (optional): Display name of a category to which the + term belongs In the case where a list of items inside a field contains the delimiter value comma (,) the field has to be escaped by using double quotes (" "). e.g. term 1, diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/business_glossary_import.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/business_glossary_import.py index f8d82a0..4075753 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/business_glossary_import.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/business_glossary_import.py @@ -1,9 +1,17 @@ +import argparse import sys -import csv_parser + + +import categories_csv_parser +import entry_type as entry_type_lib import glossary as dc_glossary import glossary_identification +import import_mode as import_mode_lib +import import_types import logging_utils +import parser_types +import terms_csv_parser import user_report import utils @@ -13,16 +21,8 @@ def main() -> None: args = utils.get_arguments() - - # Verify access token is available - if not utils.access_token_exists(): - logger.error("Environment variable GCLOUD_ACCESS_TOKEN doesn't exist.") - sys.exit(1) - - # Verify the provided csv file exists - if not utils.csv_file_exists(args.csv): - logger.error("The provided CSV file path doesn't exist.") - sys.exit(1) + utils.validate_args(args) + import_mode = utils.get_import_mode(args) # Create glossary using provided information try: @@ -38,53 +38,169 @@ def main() -> None: logger.error("Can't proceed with import. Please select a valid glossary.") utils.end_program_execution() - logger.info("Parsing input CSV...") - parser_output, parse_errors, lines_read = csv_parser.parse_glossary_csv( - args.csv - ) - - if parse_errors: - utils.display_parsing_errors(parse_errors) - if args.strict_parsing: - utils.end_program_execution() - + parsers_results = _parse_all_csv_files(args) + _print_parsing_errors(args, parsers_results) if not glossary.is_glossary_empty(): - if args.import_mode == "strict": - logger.error( - "Can't proceed with import in strict mode. Please select a more " - "permissive import mode or provide an empty glossary." - ) - utils.end_program_execution() - - elif args.import_mode == "clear": - while True: - confirm = input( - "Are you sure you want to clear the target glossary? 
(y)es/(n)o:"
-        )
-        if confirm.lower() == "yes" or confirm.lower() == "y":
-          break
-        elif confirm.lower() == "no" or confirm.lower() == "n":
-          utils.end_program_execution()
+    _handle_non_empty_glossary(import_mode, glossary)
 
-      if not glossary.clear_glossary():
-        logger.error("Could not clear the target glossary.")
-        utils.end_program_execution()
-
-  logger.info("Importing CSV terms into Business Glossary...")
-  imported_terms, imported_relations, import_errors = glossary.import_glossary(
-      parser_output
+  imported_entries, imported_relations, import_errors = (
+      _import_glossary_entries(glossary, parsers_results)
   )
+  lines_read = _lines_read(parsers_results)
 
   user_report.print_report(
       lines_read,
-      imported_terms,
+      imported_entries,
       imported_relations,
-      import_errors
+      import_errors,
   )
 
   if import_errors:
     logger.warning("Import script execution finalized with some errors.")
     sys.exit(1)
 
 
+def _lines_read(
+    parsers_results: dict[entry_type_lib.EntryType, parser_types._ParserReturnType]
+) -> dict[entry_type_lib.EntryType, int]:
+  """Returns the number of lines read for each EntryType."""
+  lines_read = {}
+
+  for entry_type in [
+      entry_type_lib.EntryType.TERM,
+      entry_type_lib.EntryType.CATEGORY,
+  ]:
+    if entry_type in parsers_results:
+      _, _, entry_lines_read = parsers_results[entry_type]
+      lines_read[entry_type] = entry_lines_read
+
+  return lines_read
+
+
+def _import_glossary_entries(
+    glossary: dc_glossary.Glossary,
+    parsers_results: dict[entry_type_lib.EntryType, parser_types._ParserReturnType],
+) -> import_types._ImportResult:
+  """Unpacks parsed terms and categories and imports them into the glossary.
+
+  Args:
+    glossary: glossary object used to import terms and categories.
+    parsers_results: dictionary mapping each entry type to its parser result;
+      a parser result consists of a parsed_entries dict, a parse_errors list
+      and a lines_read integer.
+
+  Returns:
+    A tuple consisting of:
+      * dictionary mapping EntryType to list of imported entries (terms or
+        categories)
+      * dictionary mapping EntryType to list of imported relations
+      * list of import errors
+  """
+
+  parsed_terms = None
+  parsed_categories = None
+  if entry_type_lib.EntryType.TERM in parsers_results:
+    parsed_terms, _, _ = parsers_results[entry_type_lib.EntryType.TERM]
+  if entry_type_lib.EntryType.CATEGORY in parsers_results:
+    parsed_categories, _, _ = parsers_results[entry_type_lib.EntryType.CATEGORY]
+
+  entries_to_import = ""
+  if parsed_terms:
+    entries_to_import += "terms and "
+  if parsed_categories:
+    entries_to_import += "categories and "
+  entries_to_import = entries_to_import.removesuffix(" and ")
+
+  logger.info("Importing CSV %s into Business Glossary...", entries_to_import)
+  return glossary.import_glossary(
+      terms=parsed_terms, categories=parsed_categories
+  )
+
+
+def _handle_non_empty_glossary(
+    import_mode: import_mode_lib.ImportMode, glossary: dc_glossary.Glossary
+) -> None:
+  """Handles a non-empty glossary depending on the import mode.
+
+  Args:
+    import_mode: the selected import mode (strict or clear).
+    glossary: glossary object.
+  """
+  # In strict mode the target glossary must be empty, so we log an error and
+  # finish.
+  if import_mode == import_mode_lib.ImportMode.STRICT:
+    logger.error(
+        "Can't proceed with import in strict mode. Please select a more "
+        "permissive import mode or provide an empty glossary."
+    )
+    utils.end_program_execution()
+
+  elif import_mode == import_mode_lib.ImportMode.CLEAR:
+    while True:
+      confirm = input(
+          "Are you sure you want to clear the target glossary? 
(y)es/(n)o:" + ) + if confirm.lower() == "yes" or confirm.lower() == "y": + break + elif confirm.lower() == "no" or confirm.lower() == "n": + utils.end_program_execution() + + if not glossary.clear_glossary(): + logger.error("Could not clear the target glossary.") + utils.end_program_execution() + + +def _print_parsing_errors( + args: argparse.Namespace, + parsers_results: dict[entry_type_lib.EntryType, parser_types._ParserReturnType], +) -> None: + if any_errors(parsers_results): + for _, parse_errors, _ in parsers_results.values(): + utils.display_parsing_errors(parse_errors) + if args.strict_parsing: + utils.end_program_execution() + + +def _parse_all_csv_files( + args: argparse.Namespace, +) -> dict[entry_type_lib.EntryType, parser_types._ParserReturnType]: + """Parse all CSV files. + + Args: + args: script run arguments + + Returns: + dictionary mapping EntryType to _ParserReturnType (a tuple of list of + successfully parsed terms, a list of errors and the number of lines we read + in the CSV). + """ + parsers_results = {} + terms_csv = ( + args.terms_csv if args.terms_csv is not None else args.terms_csv_legacy + ) + if terms_csv: + logger.info("Parsing terms input CSV...") + parsers_results[entry_type_lib.EntryType.TERM] = ( + terms_csv_parser.parse_glossary_csv(terms_csv) + ) + + if args.categories_csv: + logger.info("Parsing categories input CSV...") + parsers_results[entry_type_lib.EntryType.CATEGORY] = ( + categories_csv_parser.parse_glossary_csv(args.categories_csv) + ) + + if not parsers_results: + logger.error("Could not parse any records.") + utils.end_program_execution() + + return parsers_results + + +def any_errors( + parsers_results: dict[entry_type_lib.EntryType, parser_types._ParserReturnType] +) -> bool: + return any([err for _, err, _ in parsers_results.values()]) + + if __name__ == "__main__": main() diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/categories_csv_parser.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/categories_csv_parser.py new file mode 100644 index 0000000..4164732 --- /dev/null +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/categories_csv_parser.py @@ -0,0 +1,229 @@ +"""Provides functionality of reading business glossary categories from a CSV file. + +Typical usage example: + categories, errors, lines_read = read_glossary_csv() +""" + +import csv +import dataclasses +from typing import Any + +import category as bg_category +import entry_type +import error +import parse_utils +import parser_types + + +""" + Each attribute parser is represented as a tuple consisting of: + field_name: Name of the field to parse. + parser_function: Pointer to a parsing function for the field. + is_optional_field: Boolean representing if the field is optional. 
+""" +_ATTRIBUTE_PARSERS: list[tuple[str, parser_types._ParseFn[Any], bool]] = [ + ("display_name", parse_utils.parse_category_str, False), + ("description", parse_utils.parse_category_str, False), + ("data_stewards", parse_utils.parse_category_data_stewards, True), + ("belongs_to_category", parse_utils.parse_category_str, True), +] + +_MAX_DISPLAY_NAME_LENGTH = 200 +_NON_ALLOWED_DISPLAY_NAME_CHARACTERS = ("\n",) + + +@dataclasses.dataclass(frozen=True) +class CategoryEntry: + line: int + category: bg_category.Category + + # Allow unpacking as a tuple + def __iter__(self): + return iter((self.line, self.category)) + + +def parse_glossary_csv( + path: str, +) -> parser_types._ParserReturnType: + """Reads CSV file containing business glossary categories. + + Args: + path: Path of a CSV file to read. + + Returns: + _ParserReturnType - a tuple of list of successfully parsed categories, + a list of errors and the number of lines we read in the CSV. + """ + + categories = {} + errors = [] + lines_read = 0 + + # Set where we track categories that appeared previously in the glossary. + # Duplicated categories will be recorded as an error. + tracked_categories = set() + try: + with open(path) as csv_file: + csv_reader = csv.reader( + csv_file, delimiter=",", quotechar='"', skipinitialspace=True + ) + for line_idx, record in enumerate(csv_reader): + if not record: + continue + category, category_errors = parse_category( + line_idx, record, tracked_categories + ) + if category_errors: + errors.extend(category_errors) + else: + categories[line_idx + 1] = category + lines_read += 1 + except FileNotFoundError: + errors.append( + error.ParseError( + entry_type.EntryType.CATEGORY, message=f"{path} could not be found." + ) + ) + + return categories, errors, lines_read + + +def _validate_category( + category: bg_category.Category, tracked_categories: set[str] +) -> parser_types._ParseErrors: + """Validates a business glossary category. + + Performs the following tests: + - The category is unique in the CSV + - Display name is not empty + - Description is not empty + + Args: + category: Category + tracked_categories: Set of categories seen so far in the CSV + + Returns: + ParseErrors + """ + errors = [] + + # If the category display name is empty we record an error + if not category.display_name: + err = error.ParseError( + entry_type.EntryType.CATEGORY, + message="The display name for the category is empty.", + column=1, + ) + errors.append(err) + + # If the category description is empty we record an error + if not category.description: + err = error.ParseError( + entry_type.EntryType.CATEGORY, + message="The description for the category is empty.", + column=2, + ) + errors.append(err) + + if category.display_name: + # If the category has appeared before in the CSV we record an error. 
+ if category.display_name.lower() in tracked_categories: + err = error.ParseError( + entry_type.EntryType.CATEGORY, + message="The category is duplicated in the CSV.", + column=1, + resources=[category.display_name], + ) + errors.append(err) + + if len(category.display_name) > _MAX_DISPLAY_NAME_LENGTH: + err = error.ParseError( + entry_type.EntryType.CATEGORY, + message="The category's display name is too big.", + column=1, + resources=[category.display_name], + ) + errors.append(err) + + for character in _NON_ALLOWED_DISPLAY_NAME_CHARACTERS: + if character in category.display_name: + err = error.ParseError( + entry_type.EntryType.CATEGORY, + message="Unallowed character in display name.", + column=1, + resources=[category.display_name], + ) + errors.append(err) + + return errors + + +def parse_category( + line_idx: int, record: list[str], tracked_categories: set[str] +) -> parser_types._ParseResult[bg_category.Category]: + """Parses a business glossary category. + + Args: + line_idx: Index of the line where the category appears in the CSV. + record: A list of category attributes in order conforming to the CSV schema. + tracked_categories: Set of previously seen display names. + + Returns: + A tuple of parsed category and a list of errors. + """ + attributes = [] + errors = [] + + for i, (attr_name, parse_fn, is_optional_field) in enumerate( + _ATTRIBUTE_PARSERS + ): + if i >= len(record): + # Add the default value to cover for the missing field + default_value, _ = parse_fn("") # pylint:disable=not-callable + attributes.append(default_value) + # If the field is not mandatory we can skip creating a ParseError + if not is_optional_field: + err = error.ParseError( + entry_type.EntryType.CATEGORY, + message="Missing field", + line=line_idx + 1, + column=i + 1, + record=record, + resources=[attr_name], + ) + errors.append(err) + continue + + value, attr_errors = parse_fn(record[i]) # pylint:disable=not-callable + attributes.append(value) + for err in attr_errors: + err.line = line_idx + 1 + err.column = i + 1 + err.record = record + err.resources.append(attr_name) + errors.extend(attr_errors) + + ( + display_name, + description, + data_stewards, + belongs_to_category, + *_, + ) = attributes + + category = bg_category.Category( + display_name, + description, + data_stewards, + belongs_to_category, + ) + + validation_errors = _validate_category(category, tracked_categories) + for err in validation_errors: + err.line = line_idx + 1 + err.record = record + if category.display_name: + tracked_categories.add(category.display_name.lower()) + errors.extend(validation_errors) + + return category, errors diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/category.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/category.py new file mode 100644 index 0000000..3211636 --- /dev/null +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/category.py @@ -0,0 +1,92 @@ +"""Dataclass for the Category type. + +A category represents an entry in a business glossary. Category can aggregate +terms and other categories into hierarchy via belongs_to relationship. Term and +category may belong to zero or one category. One category can aggregate many +terms and categories. 
+ +Typical usage example: + stewards = ["John Doe", "Lee"] + category = Category("PII", "A Personally Identifiable Information.", + data_stewards=stewards) +""" + +from __future__ import annotations +import random +import re +import string +from typing import Any + + +class Category: + """Initializes an instance of Category. + + Attributes: + display_name: A string indicating the display name for the category. + description: A string containing a rich-text description of the category, + encoded as plain text. + data_stewards: A list of strings representing data stewards for this + category. + belongs_to_category: A string indicating the display name of another + category to which this category belongs to + category_id: A string containing a unique identifier for the category in DC. + """ + + def __init__( + self, + display_name: str, + description: str, + data_stewards: list[str] | None = None, + belongs_to_category: str | None = None, + force_category_id: str | None = None, + ): + self.display_name = display_name + self.description = description + self.data_stewards = [] if data_stewards is None else data_stewards + self.belongs_to_category = belongs_to_category + self.category_id = force_category_id or self._generate_category_id() + + def __repr__(self): + return ( + f"Category [{self.display_name} : {self.description} :" + f" {self.data_stewards} : {self.belongs_to_category}]" + ) + + def _generate_category_id(self): + """Unique glossary category ID.""" + if not self.display_name: + return "" + infix = re.sub(r"[^a-zA-Z0-9_]", "_", self.display_name).lower() + prefix = "_" if infix[0].isdigit() else "" + suffix = "".join( + random.choices(string.ascii_lowercase + string.digits, k=7) + ) + return f"{prefix}{infix}{suffix}" + + @classmethod + def from_dict(cls, entry: dict[str, Any]) -> Category | None: + """Creates a category instance from a category entry in DataCatalog. + + Args: + entry: Dictionary containing the category contents as returned by Data + Catalog. + + Returns: + Category. 
+ """ + + def _get_category_id_from_resource_path(resource: str) -> str: + return resource.split("/")[-1] + + # Parse entry UID, display_name, description - all of them are non-optional + try: + uid = _get_category_id_from_resource_path(entry["name"]) + display_name = entry["displayName"] + description = entry["coreAspects"]["business_context"]["jsonContent"][ + "description" + ] + except KeyError: + return None + + category = Category(display_name, description, force_category_id=uid) + return category diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/entry_type.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/entry_type.py new file mode 100644 index 0000000..f208b0d --- /dev/null +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/entry_type.py @@ -0,0 +1,10 @@ +"""Enum for glossary entry type.""" +import enum + + +@enum.unique +class EntryType(enum.Enum): + """Entry type.""" + + CATEGORY = "CATEGORY" + TERM = "TERM" diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/error.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/error.py index 124fba7..759b61f 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/error.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/error.py @@ -1,16 +1,19 @@ """Auxiliary classes to encapsulate errors. Typical usage example: - error = TermImportError(line=1, + error = EntryImportError( + entry_type=EntryType.TERM, + line=1, column=1, resources=["Term 1", "Term 2"], operation="create_synonym_relationship") print(error.to_string()) - """ import abc +import entry_type as entry_type_lib + _MAX_CHARS_PER_LINE = 120 @@ -18,6 +21,8 @@ class Error(abc.ABC): """Base class for Error. Attributes: + entry_type: An enum containing the type of the record in the CSV (e.g. TERM, + CATEGORY). line: An integer containing the line of the record in the CSV. column: An integer containing the column of the error in the CSV. resources: A list of the resources (terms, FQNs, entries, etc) that caused @@ -30,12 +35,14 @@ class Error(abc.ABC): def __init__( self, + entry_type: entry_type_lib.EntryType, line: int, column: int, resources: list[str], message: str | None = None, record: list[str] | None = None, ): + self.entry_type = entry_type self.line = line self.column = column self.resources = resources @@ -49,6 +56,7 @@ def __repr__(self) -> str: def to_string(self) -> str: """Generates a string containing details on the error.""" err = [] + err.append(f"{self.entry_type.value}") if self.line >= 1: err.append(f"Line {self.line}") if self.column >= 1: @@ -128,16 +136,17 @@ class ParseError(Error): def __init__( self, + entry_type: entry_type_lib.EntryType, message: str, line: int = -1, column: int = -1, resources: list[str] | None = None, record: list[str] | None = None, ): # pylint: disable=useless-parent-delegation - super().__init__(line, column, resources or [], message, record) + super().__init__(entry_type, line, column, resources or [], message, record) -class TermImportError(Error): +class EntryImportError(Error): """Initializes an instance of ImportError. ImportError objects are populated during the term import phase. 
@@ -145,6 +154,7 @@ class TermImportError(Error): def __init__( self, + entry_type: entry_type_lib.EntryType, line: int, resources: list[str], message: str | None = None, @@ -152,5 +162,5 @@ def __init__( record: list[str] | None = None, ): assert len(resources) >= 1 - super().__init__(line, -1, resources, message, record) + super().__init__(entry_type, line, -1, resources, message, record) self.operation = operation diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/glossary.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/glossary.py index ec1e910..822ca2e 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/glossary.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/glossary.py @@ -1,32 +1,26 @@ """Functions to get the state of glossary.""" -import enum import multiprocessing import re from typing import Any, Iterable, TypeVar import api_call_utils +import category as bg_category +import entry_type as entry_type_lib import error import glossary_identification +import import_types import logging_utils +import relation_type import requests import term as bg_term +import user_report +import utils -_T = TypeVar('_T') -_CreatedRelationship = tuple[str, str, _T] - logger = logging_utils.get_logger() -class RelationshipType(enum.Enum): - """Enum containing the types of relationships between terms. - """ - SYNONYMOUS = 'is_synonymous_to' - RELATED = 'is_related_to' - DESCRIBED = 'is_described_by' - - class Glossary: """Instance of a Business Glossary. @@ -37,12 +31,13 @@ class Glossary: def __init__(self, config: glossary_identification.GlossaryId): self._config = config self._glossary_endpoint = Glossary._configure_endpoint_url(self._config) + self._category_cache: dict[str, bg_category.Category] = {} self._term_cache: dict[str, bg_term.Term] = {} self._glossary_uid = None # Load UID of the target glossary self._load_glossary_uid() # Store a mapping from display names to Term entries existing in DC - self._populate_term_cache() + self._populate_caches() @classmethod def _configure_endpoint_url( @@ -80,20 +75,30 @@ def _load_glossary_uid(self) -> None: def is_glossary_empty(self) -> bool: """Verify if targeted glossary is empty.""" - if not self._term_cache: + has_categories = bool(self._category_cache) + has_terms = bool(self._term_cache) + if not has_categories and not has_terms: logger.info( f'Glossary with ID: {self._config.glossary_id} does not have any' - ' terms.' + ' categories nor terms.' ) return True - logger.info( - f'Glossary with ID: {self._config.glossary_id} already has some terms.' - ) + if has_categories: + logger.info( + f'Glossary with ID: {self._config.glossary_id} already has some' + ' categories.' + ) + + if has_terms: + logger.info( + f'Glossary with ID: {self._config.glossary_id} already has some' + ' terms.' 
+ ) return False - def _populate_term_cache(self) -> None: - """Populates an internal cache of terms existing in the target glossary.""" + def _populate_caches(self): + """Populates internal caches of terms and categories existing in the target glossary.""" endpoint = f'{self._glossary_endpoint}/entries?view=FULL' keep_reading, page_token = True, None while keep_reading: @@ -117,27 +122,53 @@ def _populate_term_cache(self) -> None: if not page_token: keep_reading = False - # Read terms in the current page - for entry in response['json'].get('entries'): - if entry['entryType'] != 'glossary_term': - continue - term = bg_term.Term.from_json(entry) - if term: - self._term_cache[term.display_name] = term - else: - logger.warning(f'Could not import term from {entry}') + self._populate_term_cache(response) + self._populate_category_cache(response) - def _create_glossary_term(self, term: bg_term.Term) -> dict[str, Any]: - """Create new term in target glossary. + def _populate_term_cache(self, response: dict[str, Any]) -> None: + """Populates an internal cache of terms existing in the target glossary.""" + for entry in response['json'].get('entries'): + if entry['entryType'] != 'glossary_term': + continue + term = bg_term.Term.from_dict(entry) + if term: + self._term_cache[term.display_name] = term + else: + logger.warning(f'Could not import term from {entry}') + + def _populate_category_cache(self, response: dict[str, Any]) -> None: + """Populates an internal cache of categories existing in the target glossary.""" + for entry in response['json'].get('entries'): + if entry['entryType'] != 'glossary_category': + continue + category = bg_category.Category.from_dict(entry) + if category: + self._category_cache[category.display_name] = category + else: + logger.warning(f'Could not import category from {entry}') + + def _create_glossary_entry( + self, entry: bg_term.Term | bg_category.Category + ) -> dict[str, Any]: + """Create new entry (term or category) in target glossary. Args: - term: Term data object. + entry: entry data object - Term or Category. Returns: Dictionary with response and response code. """ + endpoint = ( + f'{self._glossary_endpoint}/entries?entry_id={entry.category_id}' + if isinstance(entry, bg_category.Category) + else f'{self._glossary_endpoint}/entries?entry_id={entry.term_id}' + ) - endpoint = f'{self._glossary_endpoint}/entries?entry_id={term.term_id}' + entry_type = ( + 'glossary_category' + if isinstance(entry, bg_category.Category) + else 'glossary_term' + ) dest_entry_name = ( f'{self._glossary_endpoint}/entries/{self._config.glossary_id}'.replace( @@ -150,14 +181,14 @@ def _create_glossary_term(self, term: bg_term.Term) -> dict[str, Any]: endpoint, self._config.project_id, { - 'entry_type': 'glossary_term', - 'display_name': term.display_name, + 'entry_type': entry_type, + 'display_name': entry.display_name, 'core_aspects': { 'business_context': { 'aspect_type': 'business_context', 'json_content': { - 'description': term.description, - 'contacts': term.data_stewards, + 'description': entry.description, + 'contacts': entry.data_stewards, }, } }, @@ -168,6 +199,21 @@ def _create_glossary_term(self, term: bg_term.Term) -> dict[str, Any]: }, ) + def _create_glossary_categories( + self, categories: dict[int, bg_category.Category] + ) -> Iterable[Any]: + """Create new categories in the target glossary. 
+ + Args: + categories: Dictionary mapping from lines in the csv to categories + + Returns: + Iterable containing errors + """ + tasks = [(category,) for category in categories.values()] + + return Glossary._parallelize(self._create_glossary_entry, tasks) + def _create_glossary_terms( self, terms: dict[int, bg_term.Term] ) -> Iterable[Any]: @@ -178,48 +224,110 @@ def _create_glossary_terms( Returns: Iterable containing errors - """ tasks = [(term,) for term in terms.values()] - return Glossary._parallelize(self._create_glossary_term, tasks) + return Glossary._parallelize(self._create_glossary_entry, tasks) + + def _get_entry_from_cache( + self, display_name: str, entry_type: entry_type_lib.EntryType + ) -> bg_term.Term | bg_category.Category | None: + """Returns entry (term or category) based on display_name and entry_type. + + Args: + display_name: string indicating display_name of the entry + entry_type: EntryType enum indicating entry type + + Returns: + """ + if entry_type == entry_type_lib.EntryType.CATEGORY: + return self._category_cache.get(display_name) + elif entry_type == entry_type_lib.EntryType.TERM: + return self._term_cache.get(display_name) + return None + + def _get_entry_id_from_cache( + self, display_name: str, entry_type: entry_type_lib.EntryType + ) -> str | None: + entry = self._get_entry_from_cache(display_name, entry_type) + if isinstance(entry, bg_term.Term): + return entry.term_id + elif isinstance(entry, bg_category.Category): + return entry.category_id + return None def _is_relationship_valid( - self, src: str, dst: str, relationship_type: RelationshipType + self, + src_display_name: str, + src_type: entry_type_lib.EntryType, + dst_display_name: str, + dst_type: entry_type_lib.EntryType, + relationship_type: relation_type.RelationshipType, ) -> tuple[bool, str | None]: """Check if both terms in a relationship exist in the cache of terms. Args: - src: First term of the relationship. - dst: Second term of the relationship. - relationship_type: RelationshipType. + src_display_name: Display name of the first entry of the relationship. + src_type: EntryType of the source entry. + dst_display_name: Display name of the second entry of the relationship. + dst_type: EntryType of the destination entry. + relationship_type: RELATED, SYNONYMOUS, DESCRIBED or BELONGS_TO. Returns: A boolean value specifying if the relationship was created. An optional error message containing the reason why the relationship is not valid. """ - if src == dst: + src_entry = self._get_entry_from_cache(src_display_name, src_type) + dst_entry = self._get_entry_from_cache(dst_display_name, dst_type) + + # Described is a relation between asset (not present in the internal cache) + # and term. We want to check validity of this special relation as first. + if ( + relationship_type == relation_type.RelationshipType.DESCRIBED + and src_type == entry_type_lib.EntryType.TERM + and dst_type == entry_type_lib.EntryType.TERM + and dst_entry + ): + return True, None + if relationship_type == relation_type.RelationshipType.DESCRIBED and ( + src_type != entry_type_lib.EntryType.TERM + or dst_type != entry_type_lib.EntryType.TERM + ): err = ( f'Won\'t be able to create a "{relationship_type.value}" relation' - f' between "{src}" and itself.' + f' between "{src_display_name}" and "{dst_display_name}" because' + f' "{src_type}" is not a term or "{dst_type}" is not a term.' 
) return False, err - if ( - relationship_type != RelationshipType.DESCRIBED - and src not in self._term_cache - ): + if src_display_name == dst_display_name and src_type == dst_type: err = ( f'Won\'t be able to create a "{relationship_type.value}" relation' - f' between "{src}" and "{dst}" because "{src}" doesn\'t exist in the' - ' CSV.' + f' between "{src_display_name}" and itself.' ) return False, err - elif dst not in self._term_cache: + if not src_entry: err = ( f'Won\'t be able to create a "{relationship_type.value}" relation' - f' between "{src}" and "{dst}" because "{dst}" doesn\'t exist in the' - ' CSV.' + f' between "{src_display_name}" and "{dst_display_name}" because' + f' "{src_display_name}" doesn\'t exist in the CSV.' + ) + return False, err + elif not dst_entry: + err = ( + f'Won\'t be able to create a "{relationship_type.value}" relation' + f' between "{src_display_name}" and "{dst_display_name}" because' + f' "{dst_display_name}" doesn\'t exist in the CSV.' + ) + return False, err + elif ( + relationship_type == relation_type.RelationshipType.BELONGS_TO + and dst_type != entry_type_lib.EntryType.CATEGORY + ): + err = ( + f'Won\'t be able to create a "{relationship_type.value}" relation' + f' between "{src_display_name}" and "{dst_display_name}" because' + f' "{dst_display_name}" is not a category.' ) return False, err return True, None @@ -254,43 +362,54 @@ def _parse_entry_path(cls, entry_path) -> tuple[str | None, str | None]: def _create_relationship( self, - src: str, - dst: str, - relationship_type: RelationshipType - ) -> error.TermImportError | None: - """Create a relationship between two terms in DC Business Glossary. + src_display_name: str, + src_type: entry_type_lib.EntryType, + dst_display_name: str, + dst_type: entry_type_lib.EntryType, + relationship_type: relation_type.RelationshipType, + ) -> error.EntryImportError | None: + """Create a relationship between two entries in DC Business Glossary. Args: - src: source end of the relationship. - dst: destination end of the relationship. - relationship_type: RELATED, SYNONYMOUS or DESCRIBED. + src_display_name: display name of the source end of the relationship. + src_type: EntryType of the source end. + dst_display_name: display name of the destination end of the relationship. + dst_type: EntryType of the destination end. + relationship_type: RELATED, SYNONYMOUS, DESCRIBED or BELONGS_TO. + Returns: An error, if any. """ valid, error_msg = self._is_relationship_valid( - src, dst, relationship_type + src_display_name, + src_type, + dst_display_name, + dst_type, + relationship_type, ) if not valid: - return error.TermImportError( + return error.EntryImportError( + src_type, -1, - [src, dst], + [src_display_name, dst_display_name], error_msg, - operation=f'create_{relationship_type.value}_relationship', + operation=f'create_{relationship_type.value}_relationship_validation', ) - if relationship_type == RelationshipType.DESCRIBED: + if relationship_type == relation_type.RelationshipType.DESCRIBED: # If the asset has a field or column specified we extract it # e.g. 
for projects/123/locations/us-central1/entryGroups/abc/ # entries/fileset:field1, # we want to split the asset # [projects/123/locations/us-central1/entryGroups/abc/entries/fileset] # from the subfield [field1] - entry, source_column = Glossary._parse_entry_path(src) + entry, source_column = Glossary._parse_entry_path(src_display_name) if entry is None: - return error.TermImportError( + return error.EntryImportError( + src_type, -1, - [src], + [src_display_name], ( 'Resource does not conform with the expected ' '"projects/{project_id}/locations/{location}/entryGroups/' @@ -301,19 +420,35 @@ def _create_relationship( # For assets described by a term, we use the asset name after extracting # the field or column (if any) + endpoint = f'https://datacatalog.googleapis.com/v2/{entry}/relationships' + else: + # For other entries, we use the internal id + src_entry_id = self._get_entry_id_from_cache(src_display_name, src_type) + if not src_entry_id: + return error.EntryImportError( + src_type, + -1, + [src_display_name, dst_display_name], + f'Source entry {src_display_name} not found.', + operation=f'create_{relationship_type.value}_relationship', + ) endpoint = ( - 'https://datacatalog.googleapis.com/' - f'v2/{entry}/relationships' + f'{self._glossary_endpoint}/entries/{src_entry_id}/relationships' ) - else: - # For other terms, we use the internal term_id - endpoint = f'{self._glossary_endpoint}/entries/{self._term_cache[src].term_id}/relationships' # Source column is not used source_column = None + dst_entry_id = self._get_entry_id_from_cache(dst_display_name, dst_type) + if not dst_entry_id: + return error.EntryImportError( + src_type, + -1, + [src_display_name, dst_display_name], + f'Destination entry {dst_display_name} not found.', + operation=f'create_{relationship_type.value}_relationship', + ) dest_entry_name = ( - f'{self._glossary_endpoint}/entries/' - f'{self._term_cache[dst].term_id}' + f'{self._glossary_endpoint}/entries/{dst_entry_id}' ).replace('https://datacatalog.googleapis.com/v2/', '') # JSON content of the request @@ -325,7 +460,10 @@ def _create_relationship( # If a field or column in the endpoint was specified for a is_described_by # relationship, we express it by using the source_column field of the # payload - if relationship_type == RelationshipType.DESCRIBED and source_column: + if ( + relationship_type == relation_type.RelationshipType.DESCRIBED + and source_column + ): request_body['source_column'] = source_column ret = api_call_utils.fetch_api_response( @@ -337,38 +475,51 @@ def _create_relationship( err = ret['error_msg'] if err: - return error.TermImportError( + return error.EntryImportError( + src_type, -1, - [src, dst], + [src_display_name, dst_display_name], message=err, operation=f'create_{relationship_type.value}_relationship', ) def _create_relationships( self, - related_terms: set[tuple[str, str]], - relationship_type: RelationshipType + src_type: entry_type_lib.EntryType, + dst_type: entry_type_lib.EntryType, + related_entries: set[tuple[str, str]], + relationship_type: relation_type.RelationshipType, ) -> tuple[ - list[_CreatedRelationship[RelationshipType]], - list[error.TermImportError] - ]: - """Create a relationship between two terms in DC Business Glossary. + list[import_types._CreatedRelationship[relation_type.RelationshipType]], + list[error.EntryImportError], + ]: + """Create a relationship between two entries in DC Business Glossary. 
Args: - related_terms: Set of tuples containing the terms to create the + src_type: EntryType of source entries + dst_type: EntryType of destination entries + related_entries: Set of tuples containing the entries to create the relationship for - relationship_type: RelationshipType.SYNONYMOUS, RelationshipType.RELATED - or RelasionshipType.DESCRIBED + relationship_type: SYNONYMOUS, RELATED, DESCRIBED or BELONGS_TO + Returns: - List of TermImportError + List of EntryImportError """ - errors: list[error.TermImportError] = [] - successful_relations: list[_CreatedRelationship[RelationshipType]] = [] - if not related_terms: + errors: list[error.EntryImportError] = [] + successful_relations: list[ + import_types._CreatedRelationship[relation_type.RelationshipType] + ] = [] + if not related_entries: return successful_relations, errors - logger.info(f'Adding {relationship_type.value} relations between terms...') - tasks = [(src, dst, relationship_type,) for src, dst in related_terms] + logger.info( + f'Adding {relationship_type.value} relations between' + f' {src_type.value} and {dst_type.value} entries...' + ) + tasks = [ + (src, src_type, dst, dst_type, relationship_type) + for src, dst in related_entries + ] ret = Glossary._parallelize(self._create_relationship, tasks) @@ -376,7 +527,9 @@ def _create_relationships( if err: errors.append(err) else: - successful_relations.append(task) + src, _, dst, _, _ = task + successful_relation = (src, dst, relationship_type) + successful_relations.append(successful_relation) return successful_relations, errors @@ -387,25 +540,165 @@ def _parallelize(cls, task, params): return results def import_glossary( + self, + terms: dict[int, bg_term.Term] | None, + categories: dict[int, bg_category.Category] | None, + ) -> import_types._ImportResult: + """Imports categories, terms and relationships to Data Catalog Business Glossary. + + Args: + terms: dictionary indicating Term object for related line number + categories: dictionary indicating Category object for related line number + + Returns: + A tuple consisting of: + * dictionary mapping EntryType to list of imported Entries (Terms or + Categories) + * dictionary mapping EntryType to list of imported relations + * lit of import errors + """ + imported_entries = {} + imported_relations = { + entry_type_lib.EntryType.TERM: [], + entry_type_lib.EntryType.CATEGORY: [], + } + import_errors = [] + not_imported_category_belongs_to_category_relations = set() + + # Import categories if they were parsed + if categories is not None: + ( + imported_categories, + not_imported_belongs_to_relations, + categories_import_errors, + ) = self._import_glossary_categories(categories) + imported_entries[entry_type_lib.EntryType.CATEGORY] = imported_categories + import_errors.extend(categories_import_errors) + not_imported_category_belongs_to_category_relations.update( + not_imported_belongs_to_relations + ) + if categories_import_errors: + error_log_suffix = ' No terms were imported.' 
if terms else ''
+        logger.error(
+            'Errors occurred during categories import.%s', error_log_suffix
+        )
+        user_report.print_report_for_erronous_categories_import(
+            imported_categories, categories_import_errors
+        )
+        utils.end_program_execution()
+
+    # Import terms if they were parsed
+    if terms is not None:
+      (
+          imported_terms,
+          imported_relations_term_to_term,
+          terms_import_errors,
+      ) = self._import_glossary_terms(terms)
+      imported_entries[entry_type_lib.EntryType.TERM] = imported_terms
+      imported_relations[entry_type_lib.EntryType.TERM].extend(
+          imported_relations_term_to_term
+      )
+      import_errors.extend(terms_import_errors)
+
+    # Import category belongs_to category relations second (due to the
+    # hierarchy limit)
+    (
+        imported_relations_category_to_category,
+        category_to_category_relations_import_error,
+    ) = self._create_relationships(
+        src_type=entry_type_lib.EntryType.CATEGORY,
+        dst_type=entry_type_lib.EntryType.CATEGORY,
+        related_entries=not_imported_category_belongs_to_category_relations,
+        relationship_type=relation_type.RelationshipType.BELONGS_TO,
+    )
+    imported_relations[entry_type_lib.EntryType.CATEGORY].extend(
+        imported_relations_category_to_category
+    )
+    import_errors.extend(category_to_category_relations_import_error)
+
+    return (imported_entries, imported_relations, import_errors)
+
+  def _import_glossary_categories(
+      self, categories: dict[int, bg_category.Category]
+  ) -> tuple[
+      list[bg_category.Category],
+      set[tuple[str, str]],
+      list[error.EntryImportError],
+  ]:
+    """Imports categories into Data Catalog Business Glossary.
+
+    Args:
+      categories: Dictionary mapping CSV line numbers to the categories to add
+        to the glossary.
+
+    Returns:
+      A tuple containing:
+        * a list of successfully imported categories
+        * a set of category belongs_to category relations not yet imported
+        * a list of import errors
+    """
+    category_import_errors: list[error.EntryImportError] = []
+    # We import category belongs_to category relations later. Due to the
+    # hierarchy height limit we allow terms to create their belongs_to
+    # relationships first.
+    not_imported_belongs_to_relations = set()
+
+    # Create category entries
+    ret = self._create_glossary_categories(categories)
+
+    # Gather category creation results and prepare relationships
+    for elem_order, response in zip(categories.items(), ret):
+      line_num, category = elem_order
+      err = response['error_msg']
+
+      if err:
+        new_error = error.EntryImportError(
+            entry_type_lib.EntryType.CATEGORY,
+            line_num,
+            [category.display_name],
+            message=err,
+            operation='add_new_category',
+        )
+        category_import_errors.append(new_error)
+      else:
+        # Populate internal category cache
+        self._category_cache[category.display_name] = category
+
+        # Add belongs to category relations to create later
+        if category.belongs_to_category:
+          not_imported_belongs_to_relations.add(
+              (category.display_name, category.belongs_to_category)
+          )
+
+    return (
+        list(self._category_cache.values()),
+        not_imported_belongs_to_relations,
+        category_import_errors,
+    )
+
+  def _import_glossary_terms(
       self, terms: dict[int, bg_term.Term]
   ) -> tuple[
-      list[bg_term.Term],
-      list[_CreatedRelationship[RelationshipType]],
-      list[error.TermImportError]
-  ]:
+      list[bg_term.Term],
+      list[import_types._CreatedRelationship[relation_type.RelationshipType]],
+      list[error.EntryImportError],
+  ]:
     """Imports terms into Data Catalog Business Glossary.
 
     Args:
       terms: List of terms to add to the glossary.
 
     Returns:
-      A list of successfully imported terms, and a list of import errors. 
+ A tuple containing: + * a list of successfully imported terms + * a list of imported relations, + * a list of import errors """ - term_import_errors: list[error.TermImportError] = [] + term_import_errors: list[error.EntryImportError] = [] imported_relations = [] synonym_relations = set() related_term_relations = set() tagged_asset_relations = set() + belongs_to_relations = set() # Create term entries ret = self._create_glossary_terms(terms) @@ -416,7 +709,8 @@ def import_glossary( err = response['error_msg'] if err: - new_error = error.TermImportError( + new_error = error.EntryImportError( + entry_type_lib.EntryType.TERM, line_num, [term.display_name], message=err, @@ -450,15 +744,44 @@ def import_glossary( # once. tagged_asset_relations.add((src, term.display_name)) + # Add belongs to category relations to create + if term.belongs_to_category: + belongs_to_relations.add( + (term.display_name, term.belongs_to_category) + ) + tasks = [ - (synonym_relations, RelationshipType.SYNONYMOUS), - (related_term_relations, RelationshipType.RELATED), - (tagged_asset_relations, RelationshipType.DESCRIBED), + ( + entry_type_lib.EntryType.TERM, + entry_type_lib.EntryType.TERM, + synonym_relations, + relation_type.RelationshipType.SYNONYMOUS, + ), + ( + entry_type_lib.EntryType.TERM, + entry_type_lib.EntryType.TERM, + related_term_relations, + relation_type.RelationshipType.RELATED, + ), + ( + entry_type_lib.EntryType.TERM, + entry_type_lib.EntryType.TERM, + tagged_asset_relations, + relation_type.RelationshipType.DESCRIBED, + ), + ( + entry_type_lib.EntryType.TERM, + entry_type_lib.EntryType.CATEGORY, + belongs_to_relations, + relation_type.RelationshipType.BELONGS_TO, + ), ] - for relations, rel_type in tasks: + for src_type, dst_type, relations, rel_type in tasks: created_relationships, errors = self._create_relationships( - relations, - rel_type + src_type=src_type, + dst_type=dst_type, + related_entries=relations, + relationship_type=rel_type, ) imported_relations.extend(created_relationships) term_import_errors.extend(errors) @@ -469,24 +792,24 @@ def import_glossary( term_import_errors, ) - def _remove_glossary_term(self, term_id: str) -> dict[str, Any]: - """Remove term in target glossary. + def _remove_glossary_entry(self, entry_id: str) -> dict[str, Any]: + """Remove entry in target glossary. Args: - term_id: Term id in the target glossary. + entry_id: Entry id in the target glossary. Returns: Dictionary with response and response code. """ - endpoint = f'{self._glossary_endpoint}/entries/{term_id}' + endpoint = f'{self._glossary_endpoint}/entries/{entry_id}' return api_call_utils.fetch_api_response( requests.delete, endpoint, self._config.project_id ) def clear_glossary(self) -> bool: - """Remove existing terms from a Data Catalog Business Glossary. + """Remove existing terms and categories from a Data Catalog Business Glossary. Args: None. @@ -494,22 +817,27 @@ def clear_glossary(self) -> bool: Returns: A boolean indicating if the operation succeeded. """ - logger.info('Clearing the existing terms in the target glossary.') + logger.info( + 'Clearing the existing terms and categories in the target glossary.' 
+    )
     tasks = []
     for term in self._term_cache.values():
       tasks.append((term.term_id,))
+    for category in self._category_cache.values():
+      tasks.append((category.category_id,))
 
-    ret = Glossary._parallelize(self._remove_glossary_term, tasks)
+    ret = Glossary._parallelize(self._remove_glossary_entry, tasks)
 
     for response in ret:
       err = response['error_msg']
       if err:
         logger.error(
-            'Could not delete term from the target glossary.'
+            'Could not delete entry (term or category) from the target glossary.'
         )
         return False
 
-    # Refresh term cache
+    # Refresh term and category caches
     self._term_cache = {}
+    self._category_cache = {}
 
     return True
diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/import_mode.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/import_mode.py
new file mode 100644
index 0000000..cc5d039
--- /dev/null
+++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/import_mode.py
@@ -0,0 +1,10 @@
+"""Enum for import mode."""
+import enum
+
+
+@enum.unique
+class ImportMode(enum.Enum):
+  """Import mode {strict, clear}."""
+
+  STRICT = "strict"
+  CLEAR = "clear"
diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/import_types.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/import_types.py
new file mode 100644
index 0000000..9214f4f
--- /dev/null
+++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/import_types.py
@@ -0,0 +1,21 @@
+"""Types for imported data."""
+
+from typing import TypeVar
+
+import category as bg_category
+import entry_type as entry_type_lib
+import error
+import relation_type
+import term as bg_term
+
+_T = TypeVar('_T')
+_CreatedRelationship = tuple[str, str, _T]
+
+_ImportResult = tuple[
+    dict[entry_type_lib.EntryType, list[bg_term.Term | bg_category.Category]],
+    dict[
+        entry_type_lib.EntryType,
+        list[_CreatedRelationship[relation_type.RelationshipType]],
+    ],
+    list[error.EntryImportError],
+]
diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/parse_utils.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/parse_utils.py
index d4fca8f..36faebb 100644
--- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/parse_utils.py
+++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/parse_utils.py
@@ -1,8 +1,9 @@
-"""Utility functions to parse each type of field in csv_parser.py."""
+"""Utility functions to parse each type of field in terms_csv_parser.py."""
 
 import re
 from typing import TypeVar
 
+import entry_type as entry_type_lib
 import error
 
 
@@ -11,7 +12,17 @@ _ParseErrors = list[error.ParseError]
 _ParseResult = tuple[_T, _ParseErrors]
 
 
-def parse_str(s: str) -> _ParseResult[str | None]:
+def parse_category_str(s: str) -> _ParseResult[str | None]:
+  return _parse_str(entry_type_lib.EntryType.CATEGORY, s)
+
+
+def parse_term_str(s: str) -> _ParseResult[str | None]:
+  return _parse_str(entry_type_lib.EntryType.TERM, s)
+
+
+def _parse_str(
+    entry_type: entry_type_lib.EntryType, s: str
+) -> _ParseResult[str | None]:
   """Parses a single string.
 
   The parsed string might optionally be enclosed between double
   quotes. This allows the usage of comma characters
   in the string.
 
   Args:
+    entry_type: enum indicating parsed entry type (e.g. CATEGORY or TERM).
    s: input string. 
Returns: @@ -27,14 +39,25 @@ def parse_str(s: str) -> _ParseResult[str | None]: """ match = re.fullmatch(r'"[^"]*"|[^*]*', s) if match is None: - return None, [error.ParseError(f"Error parsing field {s}")] + return None, [error.ParseError(entry_type, f"Error parsing field {s}")] return match.group(0).strip('"').strip(), [] -def parse_data_stewards(s: str) -> _ParseResult[list[str]]: +def parse_category_data_stewards(s: str) -> _ParseResult[list[str]]: + return _parse_data_stewards(entry_type_lib.EntryType.CATEGORY, s) + + +def parse_term_data_stewards(s: str) -> _ParseResult[list[str]]: + return _parse_data_stewards(entry_type_lib.EntryType.TERM, s) + + +def _parse_data_stewards( + entry_type: entry_type_lib.EntryType, s: str +) -> _ParseResult[list[str]]: """Parses list of data stewards. Args: + entry_type: enum indicating parsed entry type (e.g. CATEGORY or TERM). s: A string to parse. Returns: @@ -46,16 +69,18 @@ def parse_data_stewards(s: str) -> _ParseResult[list[str]]: for steward in unfiltered: if not steward: continue - data_steward = parse_data_steward(steward) + data_steward = _parse_data_steward(steward) if data_steward: data_stewards.append(data_steward) else: - errors.append(error.ParseError(f"Error parsing data steward {steward}")) + errors.append( + error.ParseError(entry_type, f"Error parsing data steward {steward}") + ) return data_stewards, errors -def parse_data_steward(s: str) -> str | None: +def _parse_data_steward(s: str) -> str | None: """Parses a single data steward. Data stewards follows the pattern "Name ", where the name is optional. diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/parser_types.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/parser_types.py new file mode 100644 index 0000000..2011c5a --- /dev/null +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/parser_types.py @@ -0,0 +1,16 @@ +"""Provides return type for csv parsers.""" +from typing import Callable, TypeVar + +import category as bg_category +import error +import term as bg_term + +_ParseErrors = list[error.ParseError] +_ParserOutput = ( + dict[int, bg_category.Category] | dict[int, bg_term.Term] +) # Dictionary mapping lines to categories | terms +_T = TypeVar("_T") +_ParseResult = tuple[_T, _ParseErrors] +_ParseFn = Callable[[str], _ParseResult[_T]] +_LinesRead = int +_ParserReturnType = tuple[_ParserOutput, _ParseErrors, _LinesRead] diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/relation_type.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/relation_type.py new file mode 100644 index 0000000..141e50b --- /dev/null +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/relation_type.py @@ -0,0 +1,12 @@ +"""Enum for entries relationship type.""" + +import enum + + +class RelationshipType(enum.Enum): + """Enum containing the types of relationships between terms.""" + + SYNONYMOUS = 'is_synonymous_to' + RELATED = 'is_related_to' + DESCRIBED = 'is_described_by' + BELONGS_TO = 'belongs_to' diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/term.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/term.py index 3692be1..e6b1e24 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/term.py 
+++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/term.py
@@ -2,8 +2,14 @@

 A term represents an entry in a business glossary. Terms can describe assets,
 have other related terms, and have synonym terms.
+
+Typical usage example:
+  stewards = ["John Doe", "Lee"]
+  term = Term("Cost", "Total cost of the items in the purchase.",
+              data_stewards=stewards)
 """
+from __future__ import annotations
 import random
 import re
 import string
@@ -17,14 +23,16 @@ class Term:
     display_name: A string indicating the display name for the term.
     description: A string containing a rich-text description of the term,
       encoded as plain text.
-    data_stewards: A list of data stewards for this term.
+    data_stewards: A list of strings representing data stewards for this term.
    tagged_assets: A list of names for entries that are described by this
       term.
     synonyms: A list of display_name for terms that have a synonym
       relationship with this term.
     related_terms: A list of display_name for terms that have a related_to
       relationship with this term.
-    term_id: A string containing a unique identifier for the term in DC
+    belongs_to_category: A string indicating the display name of a category
+      to which this term belongs.
+    term_id: A string containing a unique identifier for the term in DC.
   """

   def __init__(
@@ -35,21 +43,23 @@ def __init__(
       tagged_assets: list[str] | None = None,
       synonyms: list[str] | None = None,
       related_terms: list[str] | None = None,
+      belongs_to_category: str | None = None,
       force_term_id: str | None = None
   ):
     self.display_name = display_name
     self.description = description
-    self.data_stewards = data_stewards or []
-    self.tagged_assets = tagged_assets or []
-    self.synonyms = synonyms or []
-    self.related_terms = related_terms or []
+    self.data_stewards = [] if data_stewards is None else data_stewards
+    self.tagged_assets = [] if tagged_assets is None else tagged_assets
+    self.synonyms = [] if synonyms is None else synonyms
+    self.related_terms = [] if related_terms is None else related_terms
+    self.belongs_to_category = belongs_to_category
     self.term_id = force_term_id or self._generate_term_id()

   def __repr__(self):
     return (
         f"Term [{self.display_name} : {self.description} :"
         f" {self.data_stewards} : {self.tagged_assets} : {self.synonyms} :"
-        f" {self.related_terms}]"
+        f" {self.related_terms} : {self.belongs_to_category}]"
     )

   def _generate_term_id(self):
@@ -57,14 +67,14 @@ def _generate_term_id(self):
     if not self.display_name:
       return ""
     infix = re.sub(r"[^a-zA-Z0-9_]", "_", self.display_name).lower()
-    prefix = "_" if infix[0] >= "0" and infix[0] <= "9" else ""
+    prefix = "_" if infix[0].isdigit() else ""
     suffix = "".join(
         random.choices(string.ascii_lowercase + string.digits, k=7)
     )
     return f"{prefix}{infix}{suffix}"

   @classmethod
-  def from_json(cls, entry: dict[str, Any]) -> ...:
+  def from_dict(cls, entry: dict[str, Any]) -> Term | None:
     """Creates a term instance from a term entry in DataCatalog.
     Args:
@@ -83,7 +93,7 @@ def _get_term_id_from_resource_path(resource: str) -> str:
       uid = _get_term_id_from_resource_path(entry["name"])
       display_name = entry["displayName"]
       description = entry["coreAspects"]["business_context"]["jsonContent"][
-          "description"
+          "description"
       ]
     except KeyError:
       return None
diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/csv_parser.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/terms_csv_parser.py
similarity index 82%
rename from dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/csv_parser.py
rename to dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/terms_csv_parser.py
index 6bc9a9d..60f4596 100644
--- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/csv_parser.py
+++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/terms_csv_parser.py
@@ -1,40 +1,36 @@
 """Provides functionality of reading business glossary terms from a CSV file.

 Typical usage example:
-  terms, errors, lines = read_glossary_csv(glossary_path)
+  terms, errors, lines_read = parse_glossary_csv(path)
 """
 import csv
 import dataclasses
-from typing import Any, Callable, TypeVar
+from typing import Any

+import entry_type
 import error
 import parse_utils
+import parser_types
 import term as bg_term

-_Terms = list[Any]
-_ParseErrors = list[error.ParseError]
-_ParserOutput = dict[int, bg_term.Term]  # Dictionary mapping lines to terms
-_T = TypeVar("_T")
-_ParseResult = tuple[_T, _ParseErrors]
-_ParseFn = Callable[[str], _ParseResult[_T]]
-_LinesRead = int
-
 """Each attribute parser is represented as a tuple consisting of:
   field_name: Name of the field to parse.
   parser_function: Pointer to a parsing function for the field.
   is_optional_field: Boolean representing if the field is optional.
 """
-_ATTRIBUTE_PARSERS: list[tuple[str, _ParseFn[Any], bool]] = [
-    ("display_name", parse_utils.parse_str, False),
-    ("description", parse_utils.parse_str, False),
-    ("data_stewards", parse_utils.parse_data_stewards, True),
+_ATTRIBUTE_PARSERS: list[tuple[str, parser_types._ParseFn[Any], bool]] = [
+    ("display_name", parse_utils.parse_term_str, False),
+    ("description", parse_utils.parse_term_str, False),
+    ("data_stewards", parse_utils.parse_term_data_stewards, True),
     ("tagged_assets", parse_utils.parse_list, True),
     ("synonyms", parse_utils.parse_list, True),
     ("relations", parse_utils.parse_list, True),
+    ("belongs_to_category", parse_utils.parse_term_str, True),
 ]

+
 _MAX_DISPLAY_NAME_LENGTH = 200
 _NON_ALLOWED_DISPLAY_NAME_CHARACTERS = ("\n",)

@@ -51,15 +47,15 @@ def __iter__(self):

 def parse_glossary_csv(
     path: str,
-) -> tuple[_ParserOutput, _ParseErrors, _LinesRead]:
+) -> parser_types._ParserReturnType:
   """Reads CSV file containing business glossary terms.

   Args:
     path: Path of a CSV file to read.

   Returns:
-    A tuple of list of successfully parsed terms, a list of errors and the
-    number of lines we read in the CSV.
+    _ParserReturnType - a tuple of successfully parsed terms, a list of
+    errors and the number of lines read from the CSV.
   """

   terms = {}
@@ -84,14 +80,18 @@
       terms[line_idx + 1] = term
       lines_read += 1
   except FileNotFoundError:
-    errors.append(error.ParseError(message=f"{path} could not be found."))
+    errors.append(
+        error.ParseError(
+            entry_type.EntryType.TERM, message=f"{path} could not be found."
+ ) + ) return terms, errors, lines_read def _validate_term( term: bg_term.Term, tracked_terms: set[str] -) -> _ParseErrors: +) -> parser_types._ParseErrors: """Validates a business glossary term. Performs the following tests: @@ -111,14 +111,16 @@ def _validate_term( # If the term display name is empty we record an error if not term.display_name: err = error.ParseError( + entry_type.EntryType.TERM, message="The display name for the term is empty.", - column=1 + column=1, ) errors.append(err) # If the term description is empty we record an error if not term.description: err = error.ParseError( + entry_type.EntryType.TERM, message="The description for the term is empty.", column=2, ) @@ -128,6 +130,7 @@ def _validate_term( # If the term has appeared before in the CSV we record an error. if term.display_name.lower() in tracked_terms: err = error.ParseError( + entry_type.EntryType.TERM, message="The term is duplicated in the CSV.", column=1, resources=[term.display_name], @@ -136,6 +139,7 @@ def _validate_term( if len(term.display_name) > _MAX_DISPLAY_NAME_LENGTH: err = error.ParseError( + entry_type.EntryType.TERM, message="The term's display name is too big.", column=1, resources=[term.display_name], @@ -145,6 +149,7 @@ def _validate_term( for character in _NON_ALLOWED_DISPLAY_NAME_CHARACTERS: if character in term.display_name: err = error.ParseError( + entry_type.EntryType.TERM, message="Unallowed character in display name.", column=1, resources=[term.display_name], @@ -156,7 +161,7 @@ def _validate_term( def parse_term( line_idx: int, record: list[str], tracked_terms: set[str] -) -> _ParseResult[bg_term.Term]: +) -> parser_types._ParseResult[bg_term.Term]: """Parses a business glossary term. Args: @@ -180,6 +185,7 @@ def parse_term( # If the field is not mandatory we can skip creating a ParseError if not is_optional_field: err = error.ParseError( + entry_type.EntryType.TERM, message="Missing field", line=line_idx + 1, column=i + 1, @@ -204,7 +210,9 @@ def parse_term( data_stewards, tagged_assets, synonyms, - related_terms, *_ + related_terms, + belongs_to_category, + *_, ) = attributes term = bg_term.Term( @@ -214,6 +222,7 @@ def parse_term( tagged_assets, synonyms, related_terms, + belongs_to_category, ) validation_errors = _validate_term(term, tracked_terms) diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/categories_csv_parser_test.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/categories_csv_parser_test.py new file mode 100644 index 0000000..57030c2 --- /dev/null +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/categories_csv_parser_test.py @@ -0,0 +1,156 @@ +import unittest +from unittest import mock + +from parameterized import parameterized +import entry_type +import categories_csv_parser as csv_parser + + +class CategoriesCsvReaderTest(unittest.TestCase): + + @parameterized.expand([ + ("category,description,name,parent_category"), + (""""category","description","name",parent_category"""), + ]) + def test_read_glossary_csv_single_line(self, content): + with mock.patch("builtins.open", mock.mock_open(read_data=content)): + categories, errors, lines_read = csv_parser.parse_glossary_csv("") + self.assertEqual(len(errors), 0) + self.assertEqual(categories[1].display_name, "category") + self.assertEqual(categories[1].description, "description") + self.assertEqual(categories[1].data_stewards, ["name"]) + 
self.assertEqual(categories[1].belongs_to_category, "parent_category") + self.assertEqual(lines_read, 1) + + def test_read_glossary_csv_multi_line(self): + content = ( + '"category1","description1","","parent_category_1"\n' + "category2,description2,,parent_category_2\n" + 'category3,description3,"name3,name33",parent_category_3' + ) + + with mock.patch("builtins.open", mock.mock_open(read_data=content)): + categories, errors, lines_read = csv_parser.parse_glossary_csv("") + self.assertEqual(len(errors), 0) + self.assertEqual(categories[1].display_name, "category1") + self.assertEqual(categories[2].display_name, "category2") + self.assertEqual(categories[3].display_name, "category3") + self.assertEqual(categories[1].description, "description1") + self.assertEqual(categories[2].description, "description2") + self.assertEqual(categories[3].description, "description3") + self.assertEqual(categories[1].data_stewards, [""]) + self.assertEqual(len(categories[2].data_stewards), 0) + self.assertEqual( + categories[3].data_stewards, + ["name3", "name33"], + ) + self.assertEqual(categories[1].belongs_to_category, "parent_category_1") + self.assertEqual(categories[2].belongs_to_category, "parent_category_2") + self.assertEqual(categories[3].belongs_to_category, "parent_category_3") + self.assertEqual(lines_read, 3) + + def test_read_glossary_csv_errors(self): + content = ( + "category1,description1,,\n" + "category_with_invalid_data_steward_format,description2,data" + " steward,\n" + "category_with_missing_fields,,\n" + "category2,description2,,parent_category" + ) + + with mock.patch("builtins.open", mock.mock_open(read_data=content)): + categories, errors, lines_read = csv_parser.parse_glossary_csv("") + self.assertEqual(len(errors), 2) + # Steward is invalid + self.assertEqual(errors[0].entry_type, entry_type.EntryType.CATEGORY) + self.assertEqual(errors[0].line, 2) + self.assertEqual(errors[0].column, 3) + # Description is missing + self.assertEqual(errors[1].entry_type, entry_type.EntryType.CATEGORY) + self.assertEqual(errors[1].line, 3) + self.assertEqual(errors[1].column, 2) + self.assertEqual(categories[1].display_name, "category1") + self.assertEqual(categories[4].display_name, "category2") + self.assertEqual(categories[1].description, "description1") + self.assertEqual(categories[4].description, "description2") + self.assertEqual(categories[1].data_stewards, [""]) + self.assertEqual(categories[4].data_stewards, [""]) + self.assertEqual(categories[4].belongs_to_category, "parent_category") + self.assertEqual(lines_read, 4) + + def test_read_glossary_csv_empty_lines(self): + content = "\n\n\n" + with mock.patch("builtins.open", mock.mock_open(read_data=content)): + categories, errors, lines_read = csv_parser.parse_glossary_csv("") + self.assertEqual(len(errors), 0) + self.assertEqual(len(categories), 0) + self.assertEqual(lines_read, 0) + + def test_read_glossary_csv_duplicate_errors(self): + content = """category 1,description1,,parent_category +Category 1,description2,,""" + + with mock.patch("builtins.open", mock.mock_open(read_data=content)): + _, errors, lines_read = csv_parser.parse_glossary_csv("") + self.assertEqual(len(errors), 1) + self.assertEqual(errors[0].entry_type, entry_type.EntryType.CATEGORY) + self.assertEqual(errors[0].line, 2) + self.assertEqual(errors[0].column, 1) + self.assertEqual( + errors[0].message, "The category is duplicated in the CSV." 
+ ) + self.assertEqual(lines_read, 2) + + def test_read_glossary_csv_empty_display_name(self): + content = """ ,description1,, +\" \",description2,,""" + + with mock.patch("builtins.open", mock.mock_open(read_data=content)): + _, errors, lines_read = csv_parser.parse_glossary_csv("") + self.assertEqual(len(errors), 2) + self.assertEqual(errors[0].entry_type, entry_type.EntryType.CATEGORY) + self.assertEqual(errors[0].line, 1) + self.assertEqual(errors[0].column, 1) + self.assertEqual( + errors[0].message, "The display name for the category is empty." + ) + self.assertEqual(errors[1].entry_type, entry_type.EntryType.CATEGORY) + self.assertEqual(errors[1].line, 2) + self.assertEqual(errors[1].column, 1) + self.assertEqual( + errors[1].message, "The display name for the category is empty." + ) + self.assertEqual(lines_read, 2) + + def test_read_glossary_csv_large_display_name(self): + name = "Test" * 51 + content = f"""{name},description1,,,,""" + + with mock.patch("builtins.open", mock.mock_open(read_data=content)): + _, errors, lines_read = csv_parser.parse_glossary_csv("") + self.assertEqual(len(errors), 1) + self.assertEqual(errors[0].entry_type, entry_type.EntryType.CATEGORY) + self.assertEqual(errors[0].line, 1) + self.assertEqual(errors[0].column, 1) + self.assertEqual( + errors[0].message, "The category's display name is too big." + ) + self.assertEqual(lines_read, 1) + + def test_read_glossary_csv_non_allowed_character(self): + content = """\"display\n name\",description1,,,,""" + + with mock.patch("builtins.open", mock.mock_open(read_data=content)): + _, errors, lines_read = csv_parser.parse_glossary_csv("") + self.assertEqual(len(errors), 1) + self.assertEqual(errors[0].entry_type, entry_type.EntryType.CATEGORY) + self.assertEqual(errors[0].line, 1) + self.assertEqual(errors[0].column, 1) + self.assertEqual( + errors[0].message, "Unallowed character in display name." 
+ ) + self.assertEqual(lines_read, 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/category_test.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/category_test.py new file mode 100644 index 0000000..2de1a97 --- /dev/null +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/category_test.py @@ -0,0 +1,83 @@ +import unittest + +import category as bg_category + + +class CategoryTest(unittest.TestCase): + + def test_create_default_arguments(self): + category = bg_category.Category("test-category-1", "description") + self.assertEqual(category.display_name, "test-category-1") + self.assertEqual(category.description, "description") + self.assertEqual(len(category.data_stewards), 0) + + def test_create_with_stewards(self): + data_steward_1 = "steward-name-1" + data_steward_2 = "" + category = bg_category.Category( + "test-category-1", + "description", + data_stewards=[data_steward_1, data_steward_2], + ) + self.assertEqual(len(category.data_stewards), 2) + + def test_create_with_invalid_parameters(self): + """Should raise a TypeError exception due to the missing description.""" + with self.assertRaises(TypeError): + bg_category.Category("test-category-1") + + def test_generates_id_based_on_display_name(self): + category1 = bg_category.Category("category", "d1") + self.assertEqual(category1.category_id[:8], "category") + category2 = bg_category.Category("category CATEGORY", "d2") + self.assertEqual(category2.category_id[:17], "category_category") + category3 = bg_category.Category("123 CATEGORY", "d3") + self.assertEqual(category3.category_id[:13], "_123_category") + + def test_create_with_belongs_to_category(self): + category = bg_category.Category( + "test-category-1", + "description", + [], + belongs_to_category="law_protected_data", + ) + self.assertEqual(len(category.data_stewards), 0) + self.assertEqual(category.belongs_to_category, "law_protected_data") + + def test_mutable_fields_not_shared(self): + category1 = bg_category.Category("test-category-1", "description") + category2 = bg_category.Category("test-category-2", "description") + category1.data_stewards.append("steward-1@example.com") + self.assertEqual(len(category1.data_stewards), 1) + self.assertEqual(len(category2.data_stewards), 0) + + def test_create_from_json(self): + category_entry = { + "name": "projects/123/locations/us/entryGroups/glossary_with_categories/entries/pii_data_xyz", + "displayName": "PII Data", + "coreAspects": { + "business_context": { + "jsonContent": { + "description": "Personally Identifiable Information Data" + } + } + }, + } + expected_category = bg_category.Category( + "PII Data", + "Personally Identifiable Information Data", + force_category_id="pii_data_xyz", + ) + + actual_category = bg_category.Category.from_dict(category_entry) + self.assertEqual( + actual_category.display_name, expected_category.display_name + ) + self.assertEqual(actual_category.description, expected_category.description) + self.assertEqual( + actual_category.category_id, expected_category.category_id + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/error_test.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/error_test.py index 595f7eb..ac9c82e 100644 --- 
a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/error_test.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/error_test.py @@ -1,50 +1,54 @@ import unittest +import entry_type import error - -class ImportErrorTest(unittest.TestCase): +class EntryImportErrorTest(unittest.TestCase): def test_to_string_default(self): - err = error.TermImportError( + err = error.EntryImportError( + entry_type.EntryType.TERM, line=1, resources=["Resource 1", "Resource 2"], message=None, operation=None, ) expected_message = ( - "Line 1 : On resource(s) [Resource 1, Resource 2]." + "TERM Line 1 : On resource(s) [Resource 1, Resource 2]." ) self.assertEqual(err.to_string(), expected_message) def test_to_string_operation(self): - err = error.TermImportError( + err = error.EntryImportError( + entry_type.EntryType.TERM, line=1, resources=["Resource 1"], message=None, operation="add_term", ) expected_message = ( - "Line 1 : Performing add_term on resource(s) [Resource 1]." + "TERM Line 1 : Performing add_term on resource(s) [Resource 1]." ) self.assertEqual(err.to_string(), expected_message) def test_to_string_message(self): - err = error.TermImportError( + err = error.EntryImportError( + entry_type.EntryType.TERM, line=1, resources=["Resource 1"], message="Removing this resource will fix the error.", operation=None, ) expected_message = ( - "Line 1 : On resource(s) [Resource 1]." + "TERM Line 1 : On resource(s) [Resource 1]." " Removing this resource will fix the error." ) self.assertEqual(err.to_string(), expected_message) def test_no_resources(self): with self.assertRaises(AssertionError): - error.TermImportError( + error.EntryImportError( + entry_type.EntryType.TERM, line=1, resources=[] ) diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/glossary_test.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/glossary_test.py index f17c2f2..649ed0a 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/glossary_test.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/glossary_test.py @@ -1,492 +1,1066 @@ import unittest from unittest import mock +import category as bg_category +import entry_type as EntryType +import error import glossary as dc_glossary import glossary_identification +import relation_type import term as bg_term +import user_report +import utils from tests.test_utils import mocks class GlossaryTest(unittest.TestCase): + def setUp(self): + super().setUp() + self.enterContext( + mock.patch( + "api_call_utils.requests.get", + side_effect=mocks.mocked_get_api_response, + ) + ) + self.post_mock = self.enterContext(mock.patch("requests.post")) + def test_glossary_uid_returned(self): - with mock.patch( - "api_call_utils.requests.get", - side_effect=mocks.mocked_get_api_response): - glossary_id = glossary_identification.GlossaryId( - "123", - "us", - "test_entry_group_with_no_terms", - "empty_glossary_exists" - ) - glossary = dc_glossary.Glossary(glossary_id) - self.assertEqual( - glossary._glossary_uid, - "71372af7-bb1a-4020-aba8-223c57c366d2" - ) + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_no_terms", "empty_glossary_exists" + ) + glossary = dc_glossary.Glossary(glossary_id) + self.assertEqual( + glossary._glossary_uid, "71372af7-bb1a-4020-aba8-223c57c366d2" + ) 
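+
+  # NOTE: setUp above relies on unittest's enterContext (Python 3.11+), which
+  # enters each mock.patch context manager and registers its exit with the
+  # test's cleanup machinery. On older interpreters a rough equivalent, shown
+  # here only as an illustrative sketch, would be:
+  #
+  #   patcher = mock.patch(
+  #       "api_call_utils.requests.get",
+  #       side_effect=mocks.mocked_get_api_response,
+  #   )
+  #   self.addCleanup(patcher.stop)
+  #   patcher.start()
+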
   def test_glossary_not_found(self):
-    with mock.patch(
-        "api_call_utils.requests.get",
-        side_effect=mocks.mocked_get_api_response):
-      glossary_id = glossary_identification.GlossaryId(
-          "123",
-          "us",
-          "test_entry_group_with_no_glossary",
-          "glossary_not_found"
-      )
-      with self.assertRaises(ValueError):
-        dc_glossary.Glossary(glossary_id)
+    glossary_id = glossary_identification.GlossaryId(
+        "123", "us", "test_entry_group_with_no_glossary", "glossary_not_found"
+    )
+    with self.assertRaises(ValueError):
+      dc_glossary.Glossary(glossary_id)
+
+  def test_glossary_print_report_and_exit_on_categories_import_error(self):
+    glossary_id = glossary_identification.GlossaryId(
+        "123",
+        "us",
+        "test_entry_group_with_categories_and_terms",
+        "glossary_not_empty",
+    )
+    glossary = dc_glossary.Glossary(glossary_id)
+    category1 = bg_category.Category("Category1", "Desc Category1")
+    term1 = bg_term.Term("Term1", "Desc Term1")
+    glossary._category_cache = {category1.display_name: category1}
+    glossary._term_cache = {term1.display_name: term1}
+
+    expected_category_import_error = error.EntryImportError(
+        EntryType.EntryType.CATEGORY,
+        1,
+        [category1.display_name],
+        "Some error message",
+    )
+    expected_imported_categories = []
+    expected_not_imported_belongs_to_relations = []
+    expected_categories_import_errors = [expected_category_import_error]
+    expected_import_glossary_categories_ret = (
+        expected_imported_categories,
+        expected_not_imported_belongs_to_relations,
+        expected_categories_import_errors,
+    )
+    expected_imported_terms = []
+    expected_imported_relations_term_to_term = set()
+    expected_terms_import_errors = []
+    expected_import_glossary_terms_ret = (
+        expected_imported_terms,
+        expected_imported_relations_term_to_term,
+        expected_terms_import_errors,
+    )
+    mock_import_glossary_categories = self.enterContext(
+        mock.patch.object(
+            dc_glossary.Glossary,
+            "_import_glossary_categories",
+            return_value=expected_import_glossary_categories_ret,
+        )
+    )
+    self.enterContext(
+        mock.patch.object(
+            dc_glossary.Glossary,
+            "_import_glossary_terms",
+            return_value=expected_import_glossary_terms_ret,
+        )
+    )
+    mock_print_report_for_erronous_categories_import = self.enterContext(
+        mock.patch.object(
+            user_report, "print_report_for_erronous_categories_import"
+        )
+    )
+    mock_end_program_execution = self.enterContext(
+        mock.patch.object(utils, "end_program_execution")
+    )
+
+    categories = {1: category1}
+    terms = {1: term1}
+    glossary.import_glossary(terms, categories)
+
+    mock_import_glossary_categories.assert_called_once_with(categories)
+    mock_print_report_for_erronous_categories_import.assert_called_once_with(
+        expected_imported_categories, expected_categories_import_errors
+    )
+    mock_end_program_execution.assert_called_once()
+
+  def test_glossary_clear_terms_and_categories(self):
+    mock_paralellize = self.enterContext(
+        mock.patch.object(dc_glossary.Glossary, "_parallelize", return_value=[])
+    )
+    glossary_id = glossary_identification.GlossaryId(
+        "123",
+        "us",
+        "test_entry_group_with_categories_and_terms",
+        "glossary_not_empty",
+    )
+    glossary = dc_glossary.Glossary(glossary_id)
+    category1 = bg_category.Category(
+        "Category1", "Desc Category1", force_category_id="Category1_xyz"
+    )
+    category2 = bg_category.Category(
+        "Category2", "Desc Category2", force_category_id="Category2_xyz"
+    )
+    term1 = bg_term.Term("Term 1", "Desc Term1", force_term_id="Term1_xyz")
+    term2 = bg_term.Term("Term 2", "Desc Term2", force_term_id="Term2_xyz")
+    glossary._category_cache = {
+        "Category1": 
category1, + "Category2": category2, + } + glossary._term_cache = { + "Term1": term1, + "Term2": term2, + } + expected_tasks = [ + ("Term1_xyz",), + ("Term2_xyz",), + ("Category1_xyz",), + ("Category2_xyz",), + ] + + glossary.clear_glossary() + + mock_paralellize.assert_called_once_with( + glossary._remove_glossary_entry, expected_tasks + ) + + def test_glossary_remove_entry(self): + delete_mock = self.enterContext(mock.patch("requests.delete")) + glossary_id = glossary_identification.GlossaryId( + "123", + "us", + "test_entry_group_with_categories_and_terms", + "glossary_not_empty", + ) + glossary = dc_glossary.Glossary(glossary_id) + category1 = bg_category.Category( + "Category1", "Desc Category1", force_category_id="Category1_xyz" + ) + glossary._category_cache = { + "Category1": category1, + } + glossary._remove_glossary_entry(category1.category_id) + + delete_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" + "entryGroups/test_entry_group_with_categories_and_terms/" + f"entries/{category1.category_id}" + ), + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json=None, + ) def test_glossary_is_not_empty(self): - with mock.patch( - "api_call_utils.requests.get", - side_effect=mocks.mocked_get_api_response): - glossary_id = glossary_identification.GlossaryId( - "123", - "us", - "test_entry_group_with_terms", - "glossary_exists" - ) - glossary = dc_glossary.Glossary(glossary_id) - self.assertFalse(glossary.is_glossary_empty()) + glossary_id = glossary_identification.GlossaryId( + "123", + "us", + "test_entry_group_with_categories_and_terms", + "glossary_not_empty", + ) + glossary = dc_glossary.Glossary(glossary_id) + self.assertFalse(glossary.is_glossary_empty()) + + def test_glossary_with_terms_is_not_empty(self): + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_terms", "glossary_exists" + ) + glossary = dc_glossary.Glossary(glossary_id) + self.assertFalse(glossary.is_glossary_empty()) + + def test_glossary_with_categories_is_not_empty(self): + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_categories", "glossary_not_empty" + ) + glossary = dc_glossary.Glossary(glossary_id) + self.assertFalse(glossary.is_glossary_empty()) def test_glossary_is_empty(self): - with mock.patch( - "api_call_utils.requests.get", - side_effect=mocks.mocked_get_api_response, - ): - glossary_id = glossary_identification.GlossaryId( - "123", "us", "test_entry_group_with_no_terms", "empty_glossary_exists" - ) - glossary = dc_glossary.Glossary(glossary_id) - is_empty = glossary.is_glossary_empty() - self.assertTrue(is_empty) + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_no_terms", "empty_glossary_exists" + ) + glossary = dc_glossary.Glossary(glossary_id) + is_empty = glossary.is_glossary_empty() + self.assertTrue(is_empty) def test_glossary_term_is_created(self): - with ( - mock.patch( - "api_call_utils.requests.get", - side_effect=mocks.mocked_get_api_response, + term = bg_term.Term("Term1", "Desc1") + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_terms", "glossary_exists" + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._create_glossary_entry(term) + self.post_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" + "entryGroups/test_entry_group_with_terms/" + 
f"entries?entry_id={term.term_id}" ), - mock.patch("requests.post") as post_mock, - ): - term = bg_term.Term("Term1", "Desc1") - glossary_id = glossary_identification.GlossaryId( - "123", "us", "test_entry_group_with_terms", "glossary_exists" - ) - glossary = dc_glossary.Glossary(glossary_id) - glossary._create_glossary_term(term) - post_mock.assert_called_with( - ( - "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" - "entryGroups/test_entry_group_with_terms/" - f"entries?entry_id={term.term_id}" - ), - headers={ - "Content-Type": "application/json", - "Authorization": "Bearer None", - "X-Goog-User-Project": "123", - }, - json={ - "entry_type": "glossary_term", - "display_name": "Term1", - "core_aspects": { - "business_context": { - "aspect_type": "business_context", - "json_content": {"description": "Desc1", "contacts": []}, - } - }, - "core_relationships": { - "relationship_type": "is_child_of", - "destination_entry_name": ( - "projects/123/locations/us/" - "entryGroups/test_entry_group_with_terms/" - "entries/glossary_exists" - ), - }, - }, - ) + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json={ + "entry_type": "glossary_term", + "display_name": "Term1", + "core_aspects": { + "business_context": { + "aspect_type": "business_context", + "json_content": {"description": "Desc1", "contacts": []}, + } + }, + "core_relationships": { + "relationship_type": "is_child_of", + "destination_entry_name": ( + "projects/123/locations/us/" + "entryGroups/test_entry_group_with_terms/" + "entries/glossary_exists" + ), + }, + }, + ) def test_glossary_term_with_one_steward_is_created(self): - with ( - mock.patch( - "api_call_utils.requests.get", - side_effect=mocks.mocked_get_api_response, + term = bg_term.Term("Term1", "Desc1", ["name "]) + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_terms", "glossary_exists" + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._create_glossary_entry(term) + self.post_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/123/" + "locations/us/entryGroups/test_entry_group_with_terms/" + f"entries?entry_id={term.term_id}" ), - mock.patch("requests.post") as post_mock, - ): - term = bg_term.Term( - "Term1", - "Desc1", - ["name "] - ) - glossary_id = glossary_identification.GlossaryId( - "123", "us", "test_entry_group_with_terms", "glossary_exists" - ) - glossary = dc_glossary.Glossary(glossary_id) - glossary._create_glossary_term(term) - post_mock.assert_called_with( - ( - "https://datacatalog.googleapis.com/v2/projects/123/" - "locations/us/entryGroups/test_entry_group_with_terms/" - f"entries?entry_id={term.term_id}" - ), - headers={ - "Content-Type": "application/json", - "Authorization": "Bearer None", - "X-Goog-User-Project": "123", - }, - json={ - "entry_type": "glossary_term", - "display_name": "Term1", - "core_aspects": { - "business_context": { - "aspect_type": "business_context", - "json_content": { - "description": "Desc1", - "contacts": ["name "] - }, - } - }, - "core_relationships": { - "relationship_type": "is_child_of", - "destination_entry_name": ( - "projects/123/locations/us/" - "entryGroups/test_entry_group_with_terms/" - "entries/glossary_exists" - ), - }, - }, - ) + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json={ + "entry_type": "glossary_term", + "display_name": "Term1", + "core_aspects": { + 
"business_context": { + "aspect_type": "business_context", + "json_content": { + "description": "Desc1", + "contacts": ["name "], + }, + } + }, + "core_relationships": { + "relationship_type": "is_child_of", + "destination_entry_name": ( + "projects/123/locations/us/" + "entryGroups/test_entry_group_with_terms/" + "entries/glossary_exists" + ), + }, + }, + ) def test_glossary_term_with_many_stewards_is_created(self): - with ( - mock.patch( - "api_call_utils.requests.get", - side_effect=mocks.mocked_get_api_response, + term = bg_term.Term( + "Term1", + "Desc1", + ["name ", "name2 "], + ) + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_terms", "glossary_exists" + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._create_glossary_entry(term) + self.post_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/123/" + "locations/us/entryGroups/test_entry_group_with_terms/" + f"entries?entry_id={term.term_id}" + ), + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json={ + "entry_type": "glossary_term", + "display_name": "Term1", + "core_aspects": { + "business_context": { + "aspect_type": "business_context", + "json_content": { + "description": "Desc1", + "contacts": ["name ", "name2 "], + }, + } + }, + "core_relationships": { + "relationship_type": "is_child_of", + "destination_entry_name": ( + "projects/123/locations/us/" + "entryGroups/test_entry_group_with_terms/" + "entries/glossary_exists" + ), + }, + }, + ) + + def test_glossary_category_is_created(self): + category = bg_category.Category("Category1", "Desc1") + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_categories", "glossary_not_empty" + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._create_glossary_entry(category) + self.post_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" + "entryGroups/test_entry_group_with_categories/" + f"entries?entry_id={category.category_id}" + ), + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json={ + "entry_type": "glossary_category", + "display_name": "Category1", + "core_aspects": { + "business_context": { + "aspect_type": "business_context", + "json_content": {"description": "Desc1", "contacts": []}, + } + }, + "core_relationships": { + "relationship_type": "is_child_of", + "destination_entry_name": ( + "projects/123/locations/us/" + "entryGroups/test_entry_group_with_categories/" + "entries/glossary_not_empty" + ), + }, + }, + ) + + def test_glossary_category_with_one_steward_is_created(self): + category = bg_category.Category("Category1", "Desc1", ["name "]) + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_categories", "glossary_not_empty" + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._create_glossary_entry(category) + self.post_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/123/" + "locations/us/entryGroups/test_entry_group_with_categories/" + f"entries?entry_id={category.category_id}" + ), + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json={ + "entry_type": "glossary_category", + "display_name": "Category1", + "core_aspects": { + "business_context": { + "aspect_type": "business_context", + "json_content": { + 
"description": "Desc1", + "contacts": ["name "], + }, + } + }, + "core_relationships": { + "relationship_type": "is_child_of", + "destination_entry_name": ( + "projects/123/locations/us/" + "entryGroups/test_entry_group_with_categories/" + "entries/glossary_not_empty" + ), + }, + }, + ) + + def test_glossary_category_with_many_stewards_is_created(self): + category = bg_category.Category( + "Category1", + "Desc1", + ["name ", "name2 "], + ) + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_categories", "glossary_not_empty" + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._create_glossary_entry(category) + self.post_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/123/" + "locations/us/entryGroups/test_entry_group_with_categories/" + f"entries?entry_id={category.category_id}" + ), + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json={ + "entry_type": "glossary_category", + "display_name": "Category1", + "core_aspects": { + "business_context": { + "aspect_type": "business_context", + "json_content": { + "description": "Desc1", + "contacts": ["name ", "name2 "], + }, + } + }, + "core_relationships": { + "relationship_type": "is_child_of", + "destination_entry_name": ( + "projects/123/locations/us/" + "entryGroups/test_entry_group_with_categories/" + "entries/glossary_not_empty" + ), + }, + }, + ) + + def test_glossary_realation_belongs_to_is_created(self): + term = bg_term.Term("Term 1", "Desc Term", belongs_to_category="Category 1") + category = bg_category.Category("Category 1", "Desc Category") + glossary_id = glossary_identification.GlossaryId( + "123", + "us", + "test_entry_group_with_categories_and_terms", + "glossary_not_empty", + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._term_cache = {term.display_name: term} + glossary._category_cache = {category.display_name: category} + glossary._create_relationship( + term.display_name, + EntryType.EntryType.TERM, + term.belongs_to_category, + EntryType.EntryType.CATEGORY, + relation_type.RelationshipType.BELONGS_TO, + ) + dest_entry_name = ( + "projects/123/locations/us/entryGroups/test_entry_group_with_categories_and_terms/" + f"entries/{category.category_id}" + ) + self.post_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" + "entryGroups/test_entry_group_with_categories_and_terms/" + f"entries/{term.term_id}/relationships" + ), + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json={ + "relationship_type": ( + relation_type.RelationshipType.BELONGS_TO.value + ), + "destination_entry_name": dest_entry_name, + }, + ) + + def test_glossary_relation_asset_is_described_by_term_is_created_even_though_asset_entry_not_in_internal_cache( + self, + ): + term = bg_term.Term("Term 1", "Desc Term", force_term_id="term_id") + asset = "projects/asset_project/locations/us-central1/entryGroups/asset-group/entries/bg_fileset:field1" + glossary_id = glossary_identification.GlossaryId( + "123", + "us", + "test_entry_group_with_categories_and_terms", + "glossary_not_empty", + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._term_cache = {term.display_name: term} + glossary._category_cache = {} + glossary._create_relationship( + asset, + EntryType.EntryType.TERM, + term.display_name, + EntryType.EntryType.TERM, + relation_type.RelationshipType.DESCRIBED, + ) + 
dest_entry_name = ( + "projects/123/locations/us/entryGroups/test_entry_group_with_categories_and_terms/" + f"entries/{term.term_id}" + ) + self.post_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/asset_project/" + "locations/us-central1/entryGroups/asset-group/" + "entries/bg_fileset/relationships" ), - mock.patch("requests.post") as post_mock, - ): - term = bg_term.Term( - "Term1", - "Desc1", - ["name ", "name2 "], - ) - glossary_id = glossary_identification.GlossaryId( - "123", "us", "test_entry_group_with_terms", "glossary_exists" - ) - glossary = dc_glossary.Glossary(glossary_id) - glossary._create_glossary_term(term) - post_mock.assert_called_with( - ( - "https://datacatalog.googleapis.com/v2/projects/123/" - "locations/us/entryGroups/test_entry_group_with_terms/" - f"entries?entry_id={term.term_id}" - ), - headers={ - "Content-Type": "application/json", - "Authorization": "Bearer None", - "X-Goog-User-Project": "123", - }, - json={ - "entry_type": "glossary_term", - "display_name": "Term1", - "core_aspects": { - "business_context": { - "aspect_type": "business_context", - "json_content": { - "description": "Desc1", - "contacts": ["name ", "name2 "] - }, - } - }, - "core_relationships": { - "relationship_type": "is_child_of", - "destination_entry_name": ( - "projects/123/locations/us/" - "entryGroups/test_entry_group_with_terms/" - "entries/glossary_exists" - ), - }, - }, - ) + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json={ + "relationship_type": relation_type.RelationshipType.DESCRIBED.value, + "destination_entry_name": dest_entry_name, + "source_column": "field1", + }, + ) + + def test_glossary_invalidate_relation_described_when_src_entry_type_is_other_than_term( + self, + ): + term1 = bg_term.Term("Term 1", "Desc1") + category1 = bg_category.Category("Category 1", "Desc1") + glossary_id = glossary_identification.GlossaryId( + "123", + "us", + "test_entry_group_with_categories_and_terms", + "glossary_not_empty", + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._category_cache = { + category1.display_name: category1, + } + glossary._term_cache = {term1.display_name: term1} + + err = glossary._create_relationship( + category1.display_name, + EntryType.EntryType.CATEGORY, + term1.display_name, + EntryType.EntryType.TERM, + relation_type.RelationshipType.DESCRIBED, + ) + + self.assertIsNotNone(err) + self.assertEqual( + err.resources, [category1.display_name, term1.display_name] + ) + self.assertEqual( + err.operation, + f"create_{relation_type.RelationshipType.DESCRIBED.value}_relationship_validation", + ) + + # We should not call post when is_described_by source is not a term + self.post_mock.assert_not_called() + + def test_glossary_invalidate_relation_described_when_dst_entry_type_is_other_than_term( + self, + ): + term1 = bg_term.Term("Term 1", "Desc1") + category1 = bg_category.Category("Category 1", "Desc1") + glossary_id = glossary_identification.GlossaryId( + "123", + "us", + "test_entry_group_with_categories_and_terms", + "glossary_not_empty", + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._category_cache = { + category1.display_name: category1, + } + glossary._term_cache = {term1.display_name: term1} + + err = glossary._create_relationship( + term1.display_name, + EntryType.EntryType.TERM, + category1.display_name, + EntryType.EntryType.CATEGORY, + relation_type.RelationshipType.DESCRIBED, + ) + + self.assertIsNotNone(err) + 
self.assertEqual(
+        err.resources, [term1.display_name, category1.display_name]
+    )
+    self.assertEqual(
+        err.operation,
+        f"create_{relation_type.RelationshipType.DESCRIBED.value}_relationship_validation",
+    )
+
+    # We should not call post when is_described_by destination is not a term
+    self.post_mock.assert_not_called()
+
+  def test_glossary_invalidate_relation_when_source_entry_is_not_in_cache(self):
+    term1 = bg_term.Term(
+        "Term 1",
+        "Desc1",
+    )
+    glossary_id = glossary_identification.GlossaryId(
+        "123",
+        "us",
+        "test_entry_group_with_categories_and_terms",
+        "glossary_not_empty",
+    )
+    glossary = dc_glossary.Glossary(glossary_id)
+    glossary._term_cache = {
+        term1.display_name: term1,
+    }
+    glossary._create_relationship(
+        "non_existent_entry",
+        EntryType.EntryType.TERM,
+        term1.display_name,
+        EntryType.EntryType.TERM,
+        relation_type.RelationshipType.SYNONYMOUS,
+    )
+    # We should not call post when source entry is not in cache
+    self.post_mock.assert_not_called()
+
+  def test_glossary_invalidate_relation_when_destination_entry_is_not_in_cache(
+      self,
+  ):
+    non_existent_entry = "non_existent_entry"
+    term1 = bg_term.Term(
+        "Term 1",
+        "Desc1",
+    )
+    glossary_id = glossary_identification.GlossaryId(
+        "123",
+        "us",
+        "test_entry_group_with_categories_and_terms",
+        "glossary_not_empty",
+    )
+    glossary = dc_glossary.Glossary(glossary_id)
+    glossary._term_cache = {
+        term1.display_name: term1,
+    }
+    err = glossary._create_relationship(
+        term1.display_name,
+        EntryType.EntryType.TERM,
+        non_existent_entry,
+        EntryType.EntryType.TERM,
+        relation_type.RelationshipType.SYNONYMOUS,
+    )
+
+    self.assertIsNotNone(err)
+    self.assertEqual(err.resources, [term1.display_name, non_existent_entry])
+    self.assertEqual(
+        err.operation,
+        f"create_{relation_type.RelationshipType.SYNONYMOUS.value}_relationship_validation",
+    )
+    # We should not call post when destination entry is not in cache
+    self.post_mock.assert_not_called()
+
+  def test_glossary_invalidate_relation_belongs_to_non_category_destination(
+      self,
+  ):
+    term1 = bg_term.Term(
+        "Term 1",
+        "Desc1",
+        belongs_to_category="Term 2",
+    )
+    term2 = bg_term.Term("Term 2", "Desc2")
+    glossary_id = glossary_identification.GlossaryId(
+        "123",
+        "us",
+        "test_entry_group_with_categories_and_terms",
+        "glossary_not_empty",
+    )
+    glossary = dc_glossary.Glossary(glossary_id)
+    glossary._term_cache = {
+        term1.display_name: term1,
+        term2.display_name: term2,
+    }
+    glossary._create_relationship(
+        term1.display_name,
+        EntryType.EntryType.TERM,
+        term1.belongs_to_category,
+        EntryType.EntryType.TERM,
+        relation_type.RelationshipType.BELONGS_TO,
+    )
+    # We should not call post when belongs_to destination is not a category
+    self.post_mock.assert_not_called()
+
+  def test_glossary_invalidate_relation_category_belongs_to_category_with_same_display_name(
+      self,
+  ):
+    category1 = bg_category.Category(
+        "Category 1",
+        "Desc1",
+        belongs_to_category="Category 1",
+    )
+    glossary_id = glossary_identification.GlossaryId(
+        "123",
+        "us",
+        "test_entry_group_with_categories_and_terms",
+        "glossary_not_empty",
+    )
+    glossary = dc_glossary.Glossary(glossary_id)
+    glossary._category_cache = {
+        category1.display_name: category1,
+    }
+    err = glossary._create_relationship(
+        category1.display_name,
+        EntryType.EntryType.CATEGORY,
+        category1.belongs_to_category,
+        EntryType.EntryType.CATEGORY,
+        relation_type.RelationshipType.BELONGS_TO,
+    )
+    self.assertIsNotNone(err.message)
+    self.post_mock.assert_not_called()
+
+  def 
test_glossary_invalidate_relation_with_missing_destination(self): + category1 = bg_category.Category( + "Category 1", + "Desc1", + belongs_to_category="Missing Category", + ) + glossary_id = glossary_identification.GlossaryId( + "123", + "us", + "test_entry_group_with_categories_and_terms", + "glossary_not_empty", + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._category_cache = { + category1.display_name: category1, + } + err = glossary._create_relationship( + category1.display_name, + EntryType.EntryType.CATEGORY, + category1.belongs_to_category, + EntryType.EntryType.CATEGORY, + relation_type.RelationshipType.BELONGS_TO, + ) + self.assertIsNotNone(err.message) + self.post_mock.assert_not_called() + + def test_glossary_invalidate_relation_with_missing_source(self): + term1 = bg_term.Term( + "Term 1", + "Desc1", + synonyms=["Term 2"], + ) + glossary_id = glossary_identification.GlossaryId( + "123", + "us", + "test_entry_group_with_categories_and_terms", + "glossary_not_empty", + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._term_cache = { + term1.display_name: term1, + } + err = glossary._create_relationship( + term1.display_name, + EntryType.EntryType.TERM, + "Term 2", + EntryType.EntryType.TERM, + relation_type.RelationshipType.SYNONYMOUS, + ) + self.assertIsNotNone(err.message) + self.post_mock.assert_not_called() + + def test_glossary_term_and_category_with_same_name_allowed(self): + term1 = bg_term.Term( + "PII", + "Term description", + belongs_to_category="PII", + force_term_id="PII_term", + ) + category1 = bg_category.Category( + "PII", + "Category description", + force_category_id="PII_category", + ) + glossary_id = glossary_identification.GlossaryId( + "123", + "us", + "test_entry_group_with_categories_and_terms", + "glossary_not_empty", + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._term_cache = {term1.display_name: term1} + glossary._category_cache = {category1.display_name: category1} + glossary._create_relationship( + term1.display_name, + EntryType.EntryType.TERM, + term1.belongs_to_category, + EntryType.EntryType.CATEGORY, + relation_type.RelationshipType.BELONGS_TO, + ) + self.post_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" + "entryGroups/test_entry_group_with_categories_and_terms/" + f"entries/{term1.term_id}/relationships" + ), + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json={ + "relationship_type": "belongs_to", + "destination_entry_name": f"projects/123/locations/us/entryGroups/test_entry_group_with_categories_and_terms/entries/{category1.category_id}", + }, + ) def test_glossary_synonym_relation_is_created(self): - with ( - mock.patch( - "api_call_utils.requests.get", - side_effect=mocks.mocked_get_api_response, + term = bg_term.Term("Term 1", "Desc1", synonyms=["Term 2"]) + term2 = bg_term.Term("Term 2", "Desc2", synonyms=["Term 1"]) + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_terms", "glossary_exists" + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._term_cache = { + term.display_name: term, + term2.display_name: term2, + } + glossary._create_relationship( + term.display_name, + EntryType.EntryType.TERM, + term.synonyms[0], + EntryType.EntryType.TERM, + relation_type.RelationshipType.SYNONYMOUS, + ) + dest_entry_name = ( + "projects/123/locations/us/entryGroups/test_entry_group_with_terms/" + f"entries/{term2.term_id}" + ) + 
self.post_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" + "entryGroups/test_entry_group_with_terms/" + f"entries/{term.term_id}/relationships" ), - mock.patch("requests.post") as post_mock, - ): - term = bg_term.Term("Term 1", "Desc1", synonyms=["Term 2"]) - term2 = bg_term.Term("Term 2", "Desc2", synonyms=["Term 1"]) - glossary_id = glossary_identification.GlossaryId( - "123", - "us", - "test_entry_group_with_terms", - "glossary_exists" - ) - glossary = dc_glossary.Glossary(glossary_id) - glossary._term_cache = { - term.display_name: term, - term2.display_name: term2 - } - glossary._create_relationship( - term.display_name, - term.synonyms[0], - dc_glossary.RelationshipType.SYNONYMOUS - ) - dest_entry_name = ( - "projects/123/locations/us/entryGroups/test_entry_group_with_terms/" - f"entries/{term2.term_id}" - ) - post_mock.assert_called_with( - ( - "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" - "entryGroups/test_entry_group_with_terms/" - f"entries/{term.term_id}/relationships" - ), - headers={ - "Content-Type": "application/json", - "Authorization": "Bearer None", - "X-Goog-User-Project": "123", - }, - json={ - "relationship_type": - dc_glossary.RelationshipType.SYNONYMOUS.value, - "destination_entry_name": dest_entry_name, - }, - ) - - def test_glossary_related_to_relation_is_created(self): - with ( - mock.patch( - "api_call_utils.requests.get", - side_effect=mocks.mocked_get_api_response, + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json={ + "relationship_type": ( + relation_type.RelationshipType.SYNONYMOUS.value + ), + "destination_entry_name": dest_entry_name, + }, + ) + + def test_glossary_relation_related_to_is_created(self): + term = bg_term.Term("Term 1", "Desc1", related_terms=["Term 2"]) + term2 = bg_term.Term("Term 2", "Desc2", related_terms=["Term 1"]) + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_terms", "glossary_exists" + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._term_cache = { + term.display_name: term, + term2.display_name: term2, + } + glossary._create_relationship( + term.display_name, + EntryType.EntryType.TERM, + term.related_terms[0], + EntryType.EntryType.TERM, + relation_type.RelationshipType.RELATED, + ) + dest_entry_name = ( + "projects/123/locations/us/" + f"entryGroups/test_entry_group_with_terms/entries/{term2.term_id}" + ) + self.post_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" + "entryGroups/test_entry_group_with_terms/" + f"entries/{term.term_id}/relationships" ), - mock.patch("requests.post") as post_mock - ): - term = bg_term.Term("Term 1", "Desc1", related_terms=["Term 2"]) - term2 = bg_term.Term("Term 2", "Desc2", related_terms=["Term 1"]) - glossary_id = glossary_identification.GlossaryId( - "123", - "us", - "test_entry_group_with_terms", - "glossary_exists" - ) - glossary = dc_glossary.Glossary(glossary_id) - glossary._term_cache = { - term.display_name: term, - term2.display_name: term2 - } - glossary._create_relationship( - term.display_name, - term.related_terms[0], - dc_glossary.RelationshipType.RELATED - ) - dest_entry_name = ( - "projects/123/locations/us/" - f"entryGroups/test_entry_group_with_terms/entries/{term2.term_id}" - ) - post_mock.assert_called_with( - ( - "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" - "entryGroups/test_entry_group_with_terms/" - 
f"entries/{term.term_id}/relationships" - ), - headers={ - "Content-Type": "application/json", - "Authorization": "Bearer None", - "X-Goog-User-Project": "123", - }, - json={ - "relationship_type": dc_glossary.RelationshipType.RELATED.value, - "destination_entry_name": dest_entry_name, - }, - ) + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json={ + "relationship_type": relation_type.RelationshipType.RELATED.value, + "destination_entry_name": dest_entry_name, + }, + ) def test_glossary_unsuccessful_creation_returns_error(self): - with ( - mock.patch( - "api_call_utils.requests.get", - side_effect=mocks.mocked_get_api_response, - ), + self.post_mock = self.enterContext( mock.patch( "api_call_utils.requests.post", - side_effect=mocks.mocked_post_failed_api_response) as post_mock, - ): - term = bg_term.Term("Term 1", "Desc1", related_terms=["Term 2"]) - glossary_id = glossary_identification.GlossaryId( - "123", - "us", - "test_entry_group_with_terms", - "glossary_exists" - ) - glossary = dc_glossary.Glossary(glossary_id) - glossary._term_cache = {term.display_name: term} - ret = glossary._create_glossary_term(term) - self.assertIsNone(ret["json"]) - self.assertIsNotNone(ret["error_msg"]) - post_mock.assert_called_with( - ( - "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" - "entryGroups/test_entry_group_with_terms/" - f"entries?entry_id={term.term_id}" - ), - headers={ - "Content-Type": "application/json", - "Authorization": "Bearer None", - "X-Goog-User-Project": "123", - }, - json={ - "entry_type": "glossary_term", - "display_name": "Term 1", - "core_aspects": { - "business_context": { - "aspect_type": "business_context", - "json_content": {"description": "Desc1", "contacts": []}, - } - }, - "core_relationships": { - "relationship_type": "is_child_of", - "destination_entry_name": ( - "projects/123/locations/us/" - "entryGroups/test_entry_group_with_terms/" - "entries/glossary_exists" - ), - }, - }, - ) + side_effect=mocks.mocked_post_failed_api_response, + ) + ) + term = bg_term.Term("Term 1", "Desc1", related_terms=["Term 2"]) + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_terms", "glossary_exists" + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._term_cache = {term.display_name: term} + ret = glossary._create_glossary_entry(term) + self.assertIsNone(ret["json"]) + self.assertIsNotNone(ret["error_msg"]) + self.post_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" + "entryGroups/test_entry_group_with_terms/" + f"entries?entry_id={term.term_id}" + ), + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json={ + "entry_type": "glossary_term", + "display_name": "Term 1", + "core_aspects": { + "business_context": { + "aspect_type": "business_context", + "json_content": {"description": "Desc1", "contacts": []}, + } + }, + "core_relationships": { + "relationship_type": "is_child_of", + "destination_entry_name": ( + "projects/123/locations/us/" + "entryGroups/test_entry_group_with_terms/" + "entries/glossary_exists" + ), + }, + }, + ) def test_glossary_described_by_relation_is_created(self): - with ( - mock.patch( - "api_call_utils.requests.get", - side_effect=mocks.mocked_get_api_response, + term = bg_term.Term("Term 1", "Desc1") + asset_name = ( + "projects/123/locations/us/entryGroups/test_entry_group_with_terms/" + 
"entries/test_asset" + ) + dest_entry_name = ( + "projects/123/locations/us/entryGroups/test_entry_group_with_terms/" + f"entries/{term.term_id}" + ) + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_terms", "glossary_exists" + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._term_cache = {term.display_name: term} + + glossary._create_relationship( + asset_name, + EntryType.EntryType.TERM, + term.display_name, + EntryType.EntryType.TERM, + relation_type.RelationshipType.DESCRIBED, + ) + + self.post_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" + "entryGroups/test_entry_group_with_terms/" + "entries/test_asset/relationships" ), - mock.patch( - "api_call_utils.requests.post", - side_effect=mocks.mocked_post_failed_api_response) as post_mock, - ): - term = bg_term.Term("Term 1", "Desc1") - asset_name = ( - "projects/123/locations/us/entryGroups/test_entry_group_with_terms/" - "entries/test_asset" - ) - dest_entry_name = ( - "projects/123/locations/us/entryGroups/test_entry_group_with_terms/" - f"entries/{term.term_id}" - ) - glossary_id = glossary_identification.GlossaryId( - "123", - "us", - "test_entry_group_with_terms", - "glossary_exists" - ) - glossary = dc_glossary.Glossary(glossary_id) - glossary._term_cache = {term.display_name: term} - - glossary._create_relationship( - asset_name, - term.display_name, - dc_glossary.RelationshipType.DESCRIBED - ) - - post_mock.assert_called_with( - ( - "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" - "entryGroups/test_entry_group_with_terms/" - "entries/test_asset/relationships" - ), - headers={ - "Content-Type": "application/json", - "Authorization": "Bearer None", - "X-Goog-User-Project": "123", - }, - json={ - "relationship_type": dc_glossary.RelationshipType.DESCRIBED.value, - "destination_entry_name": dest_entry_name, - }, - ) + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json={ + "relationship_type": relation_type.RelationshipType.DESCRIBED.value, + "destination_entry_name": dest_entry_name, + }, + ) def test_glossary_described_by_relation_with_column_is_created(self): - with ( - mock.patch( - "api_call_utils.requests.get", - side_effect=mocks.mocked_get_api_response, + term = bg_term.Term("Term 1", "Desc1") + asset_name = ( + "projects/123/locations/us/entryGroups/test_entry_group_with_terms/" + "entries/test_asset:subcolumn" + ) + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_terms", "glossary_exists" + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._term_cache = {term.display_name: term} + glossary._create_relationship( + asset_name, + EntryType.EntryType.TERM, + term.display_name, + EntryType.EntryType.TERM, + relation_type.RelationshipType.DESCRIBED, + ) + dest_entry_name = ( + "projects/123/locations/us/entryGroups/test_entry_group_with_terms/" + f"entries/{term.term_id}" + ) + self.post_mock.assert_called_with( + ( + "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" + "entryGroups/test_entry_group_with_terms/entries/test_asset/relationships" ), - mock.patch( - "api_call_utils.requests.post", - side_effect=mocks.mocked_post_failed_api_response) as post_mock, - ): - term = bg_term.Term("Term 1", "Desc1") - asset_name = ( - "projects/123/locations/us/entryGroups/test_entry_group_with_terms/" - "entries/test_asset:subcolumn" - ) - glossary_id = 
glossary_identification.GlossaryId( - "123", - "us", - "test_entry_group_with_terms", - "glossary_exists" - ) - glossary = dc_glossary.Glossary(glossary_id) - glossary._term_cache = {term.display_name: term} - glossary._create_relationship( - asset_name, - term.display_name, - dc_glossary.RelationshipType.DESCRIBED - ) - dest_entry_name = ( - "projects/123/locations/us/entryGroups/test_entry_group_with_terms/" - f"entries/{term.term_id}" - ) - post_mock.assert_called_with( - ( - "https://datacatalog.googleapis.com/v2/projects/123/locations/us/" - "entryGroups/test_entry_group_with_terms/entries/test_asset/relationships" - ), - headers={ - "Content-Type": "application/json", - "Authorization": "Bearer None", - "X-Goog-User-Project": "123", - }, - json={ - "relationship_type": dc_glossary.RelationshipType.DESCRIBED.value, - "destination_entry_name": dest_entry_name, - "source_column": "subcolumn", - }, - ) + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer None", + "X-Goog-User-Project": "123", + }, + json={ + "relationship_type": relation_type.RelationshipType.DESCRIBED.value, + "destination_entry_name": dest_entry_name, + "source_column": "subcolumn", + }, + ) def test_glossary_self_relation_error(self): - with ( - mock.patch( - "api_call_utils.requests.get", - side_effect=mocks.mocked_get_api_response, - ) - ): - term = bg_term.Term("Term 1", "Desc1") - glossary_id = glossary_identification.GlossaryId( - "123", - "us", - "test_entry_group_with_terms", - "glossary_exists" - ) - glossary = dc_glossary.Glossary(glossary_id) - glossary._term_cache = {term.display_name: term} - err = glossary._create_relationship( - term.display_name, - term.display_name, - dc_glossary.RelationshipType.RELATED - ) - self.assertIsNotNone(err) - self.assertEqual(err.resources, [term.display_name, term.display_name]) - self.assertEqual( - err.operation, - f"create_{dc_glossary.RelationshipType.RELATED.value}_relationship" - ) + term = bg_term.Term("Term 1", "Desc1") + glossary_id = glossary_identification.GlossaryId( + "123", "us", "test_entry_group_with_terms", "glossary_exists" + ) + glossary = dc_glossary.Glossary(glossary_id) + glossary._term_cache = {term.display_name: term} + err = glossary._create_relationship( + term.display_name, + EntryType.EntryType.TERM, + term.display_name, + EntryType.EntryType.TERM, + relation_type.RelationshipType.RELATED, + ) + self.assertIsNotNone(err) + self.assertEqual(err.resources, [term.display_name, term.display_name]) + self.assertEqual( + err.operation, + f"create_{relation_type.RelationshipType.RELATED.value}_relationship_validation", + ) def test_glossary_parse_entry_path_with_source_column(self): asset_name = ( @@ -499,7 +1073,7 @@ def test_glossary_parse_entry_path_with_source_column(self): ( "projects/123/locations/us/entryGroups/test_entry_group_with_terms/" "entries/test_asset" - ) + ), ) self.assertEqual(source_column, "subcolumn") @@ -538,7 +1112,7 @@ def test_glossary_parse_entry_source_column_with_multiple_colons(self): ( "projects/123/locations/us/entryGroups/test_entry_group_with_terms/" "entries/test_asset" - ) + ), ) self.assertEqual(source_column, "subcolumn:with:colons") diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/parse_utils_test.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/parse_utils_test.py index 0ee973c..b1e5644 100644 --- 
a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/parse_utils_test.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/parse_utils_test.py @@ -1,78 +1,151 @@ import unittest from parameterized import parameterized +import entry_type as entry_type_lib import parse_utils class ParseUtilsTest(unittest.TestCase): + @parameterized.expand([ + (entry_type_lib.EntryType.CATEGORY, "", [""]), + (entry_type_lib.EntryType.CATEGORY, " name ", ["name"]), + (entry_type_lib.EntryType.CATEGORY, "Data Steward", ["Data Steward"]), + (entry_type_lib.EntryType.TERM, "", [""]), + (entry_type_lib.EntryType.TERM, " name ", ["name"]), + (entry_type_lib.EntryType.TERM, "Data Steward", ["Data Steward"]), + ]) + def test_parse_data_stewards( + self, entry_type: entry_type_lib.EntryType, text: str, expected: list[str] + ): + ds, errors = parse_utils._parse_data_stewards(entry_type, text) + self.assertEqual(len(errors), 0) + self.assertEqual(ds, expected) + + @parameterized.expand([ + ("", [""]), + (" name ", ["name"]), + ("Data Steward",["Data Steward"]), + ]) + def test_parse_category_data_stewards(self, text: str, expected: list[str]): + ds, errors = parse_utils.parse_category_data_stewards(text) + self.assertEqual(len(errors), 0) + self.assertEqual(ds, expected) + @parameterized.expand([ ("", [""]), (" name ", ["name"]), - ("Data Steward", ["Data Steward"]), + ( + "Data Steward", + ["Data Steward"], + ), ]) - def test_parse_data_stewards(self, text: str, expected: list[str]): - ds, errors = parse_utils.parse_data_stewards(text) + def test_parse_terms_data_stewards(self, text: str, expected: list[str]): + ds, errors = parse_utils.parse_term_data_stewards(text) self.assertEqual(len(errors), 0) self.assertEqual(ds, expected) + @parameterized.expand([ + (entry_type_lib.EntryType.CATEGORY, ""), + (entry_type_lib.EntryType.CATEGORY, " "), + (entry_type_lib.EntryType.CATEGORY, " "), + (entry_type_lib.EntryType.CATEGORY, '" "'), + (entry_type_lib.EntryType.CATEGORY, '" "'), + (entry_type_lib.EntryType.TERM, ""), + (entry_type_lib.EntryType.TERM, " "), + (entry_type_lib.EntryType.TERM, " "), + (entry_type_lib.EntryType.TERM, '" "'), + (entry_type_lib.EntryType.TERM, '" "'), + ]) + def test_parse_empty_string( + self, entry_type: entry_type_lib.EntryType, text: str + ): + ds, errors = parse_utils._parse_str(entry_type, text) + self.assertEqual(len(ds), 0) + self.assertEqual(len(errors), 0) + + @parameterized.expand([ + (""), + (" "), + (" "), + ('" "'), + ('" "'), + ]) + def test_parse_category_string(self, text: str): + ds, errors = parse_utils.parse_category_str(text) + self.assertEqual(len(ds), 0) + self.assertEqual(len(errors), 0) + @parameterized.expand([ (""), (" "), (" "), - ("\" \""), - ("\" \"") + ('" "'), + ('" "'), ]) - def test_parse_empty_string(self, text: str): - ds, errors = parse_utils.parse_str(text) + def test_parse_term_string(self, text: str): + ds, errors = parse_utils.parse_term_str(text) self.assertEqual(len(ds), 0) self.assertEqual(len(errors), 0) - def test_parse_data_stewards_2(self): - ds, errors = parse_utils.parse_data_stewards( - "Data Steward I, Data Steward II," - " " + @parameterized.expand([ + (entry_type_lib.EntryType.CATEGORY,), + (entry_type_lib.EntryType.TERM,), + ]) + def test_parse_data_stewards_2(self, entry_type: entry_type_lib.EntryType): + ds, errors = parse_utils._parse_data_stewards( + entry_type, + "Data Steward I, Data Steward II," + " ", ) self.assertEqual(len(errors), 0) 
self.assertEqual( ds, [ - "Data Steward I", - "Data Steward II", - "", + "Data Steward I", + "Data Steward II", + "", ], ) @parameterized.expand([ - (""), - (" "), - (","), - (" , , "), + (entry_type_lib.EntryType.CATEGORY, ""), + (entry_type_lib.EntryType.CATEGORY, " "), + (entry_type_lib.EntryType.CATEGORY, ","), + (entry_type_lib.EntryType.CATEGORY, " , , "), + (entry_type_lib.EntryType.TERM, ""), + (entry_type_lib.EntryType.TERM, " "), + (entry_type_lib.EntryType.TERM, ","), + (entry_type_lib.EntryType.TERM, " , , "), ]) - def test_parse_data_stewards_empty(self, text: str): - ds, errors = parse_utils.parse_data_stewards(text) + def test_parse_data_stewards_empty( + self, entry_type: entry_type_lib.EntryType, text: str + ): + ds, errors = parse_utils._parse_data_stewards(entry_type, text) self.assertEqual(len(errors), 0) self.assertEqual(len(ds), 0) - def test_parse_data_stewards_errors(self): - ds, errors = parse_utils.parse_data_stewards( - "no_email, ok, x, <>" + @parameterized.expand([ + (entry_type_lib.EntryType.CATEGORY,), + (entry_type_lib.EntryType.TERM,), + ]) + def test_parse_data_stewards_errors( + self, entry_type: entry_type_lib.EntryType + ): + ds, errors = parse_utils._parse_data_stewards( + entry_type, "no_email, ok, x, <>" ) self.assertEqual(len(errors), 4) self.assertEqual(ds, ["ok"]) def test_parse_string_list(self): - ret, errors = parse_utils.parse_list( - "term 1, term 2, term 3" - ) + ret, errors = parse_utils.parse_list("term 1, term 2, term 3") self.assertEqual(len(errors), 0) self.assertEqual(ret, ["term 1", "term 2", "term 3"]) def test_parse_string_list_with_delimiter_character(self): - ret, errors = parse_utils.parse_list( - "term 1, term 2, term 3, \"term, 4\"," - ) + ret, errors = parse_utils.parse_list('term 1, term 2, term 3, "term, 4",') self.assertEqual(len(errors), 0) self.assertEqual(ret, ["term 1", "term 2", "term 3", "term, 4"]) diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/term_test.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/term_test.py index 5eba269..0aae320 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/term_test.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/term_test.py @@ -65,6 +65,32 @@ def test_mutable_fields_not_shared(self): self.assertEqual(len(term1.data_stewards), 1) self.assertEqual(len(term2.data_stewards), 0) + def test_create_from_json(self): + term_entry = { + "name": "projects/123/locations/us/entryGroups/glossary_with_terms/entries/pii_data_xyz", + "displayName": "PII Data", + "coreAspects": { + "business_context": { + "jsonContent": { + "description": "Personally Identifiable Information Data" + } + } + }, + } + expected_term = bg_term.Term( + "PII Data", + "Personally Identifiable Information Data", + force_term_id="pii_data_xyz", + ) + + actual_term = bg_term.Term.from_dict(term_entry) + self.assertEqual( + actual_term.display_name, expected_term.display_name + ) + self.assertEqual(actual_term.description, expected_term.description) + self.assertEqual( + actual_term.term_id, expected_term.term_id + ) if __name__ == "__main__": unittest.main() diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/csv_parser_test.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/terms_csv_parser_test.py similarity 
index 70% rename from dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/csv_parser_test.py rename to dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/terms_csv_parser_test.py index dbdffae..af9b624 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/csv_parser_test.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/terms_csv_parser_test.py @@ -1,11 +1,12 @@ import unittest from unittest import mock -import csv_parser +import entry_type +import terms_csv_parser from parameterized import parameterized -class CsvReaderTest(unittest.TestCase): +class TermsCsvReaderTest(unittest.TestCase): @parameterized.expand([ ("term,description,name,,,"), @@ -13,7 +14,7 @@ class CsvReaderTest(unittest.TestCase): ]) def test_read_glossary_csv_single_line(self, content): with mock.patch("builtins.open", mock.mock_open(read_data=content)): - terms, errors, lines_read = csv_parser.parse_glossary_csv("") + terms, errors, lines_read = terms_csv_parser.parse_glossary_csv("") self.assertEqual(len(errors), 0) self.assertEqual(terms[1].display_name, "term") self.assertEqual(terms[1].description, "description") @@ -23,12 +24,12 @@ def test_read_glossary_csv_single_line(self, content): def test_read_glossary_csv_multi_line(self): content = ( '"term1","description1","",,,\n' - "term2,description2,,,,\n" - 'term3,description3,"name3, name33",,,' + "term2,description2,,,,,category2\n" + 'term3,description3,"name3, name33",,,,category3' ) with mock.patch("builtins.open", mock.mock_open(read_data=content)): - terms, errors, lines_read = csv_parser.parse_glossary_csv("") + terms, errors, lines_read = terms_csv_parser.parse_glossary_csv("") self.assertEqual(len(errors), 0) self.assertEqual(terms[1].display_name, "term1") self.assertEqual(terms[2].display_name, "term2") @@ -39,8 +40,12 @@ def test_read_glossary_csv_multi_line(self): self.assertEqual(terms[1].data_stewards, [""]) self.assertEqual(len(terms[2].data_stewards), 0) self.assertEqual( - terms[3].data_stewards, ["name3", "name33"], + terms[3].data_stewards, + ["name3", "name33"], ) + self.assertEqual(terms[1].belongs_to_category, "") + self.assertEqual(terms[2].belongs_to_category, "category2") + self.assertEqual(terms[3].belongs_to_category, "category3") self.assertEqual(lines_read, 3) def test_read_glossary_csv_errors(self): @@ -51,15 +56,15 @@ def test_read_glossary_csv_errors(self): "term2,description2,,,," ) - with mock.patch( - "builtins.open", mock.mock_open(read_data=content) - ): - terms, errors, lines_read = csv_parser.parse_glossary_csv("") + with mock.patch("builtins.open", mock.mock_open(read_data=content)): + terms, errors, lines_read = terms_csv_parser.parse_glossary_csv("") self.assertEqual(len(errors), 2) # Steward is invalid + self.assertEqual(errors[0].entry_type, entry_type.EntryType.TERM) self.assertEqual(errors[0].line, 2) self.assertEqual(errors[0].column, 3) # Description is missing + self.assertEqual(errors[1].entry_type, entry_type.EntryType.TERM) self.assertEqual(errors[1].line, 3) self.assertEqual(errors[1].column, 2) self.assertEqual(terms[1].display_name, "term1") @@ -73,7 +78,7 @@ def test_read_glossary_csv_errors(self): def test_read_glossary_csv_empty_lines(self): content = "\n\n\n" with mock.patch("builtins.open", mock.mock_open(read_data=content)): - terms, errors, lines_read = csv_parser.parse_glossary_csv("") + terms, errors, lines_read = 
terms_csv_parser.parse_glossary_csv("") self.assertEqual(len(errors), 0) self.assertEqual(len(terms), 0) self.assertEqual(lines_read, 0) @@ -82,11 +87,10 @@ def test_read_glossary_csv_duplicate_errors(self): content = """term 1,description1,,,, Term 1,description2,,,,""" - with mock.patch( - "builtins.open", mock.mock_open(read_data=content) - ): - _, errors, lines_read = csv_parser.parse_glossary_csv("") + with mock.patch("builtins.open", mock.mock_open(read_data=content)): + _, errors, lines_read = terms_csv_parser.parse_glossary_csv("") self.assertEqual(len(errors), 1) + self.assertEqual(errors[0].entry_type, entry_type.EntryType.TERM) self.assertEqual(errors[0].line, 2) self.assertEqual(errors[0].column, 1) self.assertEqual(errors[0].message, "The term is duplicated in the CSV.") @@ -97,13 +101,15 @@ def test_read_glossary_csv_empty_display_name(self): \" \",description2,,,,""" with mock.patch("builtins.open", mock.mock_open(read_data=content)): - _, errors, lines_read = csv_parser.parse_glossary_csv("") + _, errors, lines_read = terms_csv_parser.parse_glossary_csv("") self.assertEqual(len(errors), 2) + self.assertEqual(errors[0].entry_type, entry_type.EntryType.TERM) self.assertEqual(errors[0].line, 1) self.assertEqual(errors[0].column, 1) self.assertEqual( errors[0].message, "The display name for the term is empty." ) + self.assertEqual(errors[1].entry_type, entry_type.EntryType.TERM) self.assertEqual(errors[1].line, 2) self.assertEqual(errors[1].column, 1) self.assertEqual( @@ -115,22 +121,22 @@ def test_read_glossary_csv_large_display_name(self): name = "Test" * 51 content = f"""{name},description1,,,,""" - with mock.patch( - "builtins.open", mock.mock_open(read_data=content) - ): - _, errors, lines_read = csv_parser.parse_glossary_csv("") + with mock.patch("builtins.open", mock.mock_open(read_data=content)): + _, errors, lines_read = terms_csv_parser.parse_glossary_csv("") self.assertEqual(len(errors), 1) + self.assertEqual(errors[0].entry_type, entry_type.EntryType.TERM) self.assertEqual(errors[0].line, 1) self.assertEqual(errors[0].column, 1) self.assertEqual(errors[0].message, "The term's display name is too big.") self.assertEqual(lines_read, 1) def test_read_glossary_csv_non_allowed_character(self): - content = """\"display\n name\",description1,,,,""" + content = """\"display\n name\",description1,,,,""" with mock.patch("builtins.open", mock.mock_open(read_data=content)): - _, errors, lines_read = csv_parser.parse_glossary_csv("") + _, errors, lines_read = terms_csv_parser.parse_glossary_csv("") self.assertEqual(len(errors), 1) + self.assertEqual(errors[0].entry_type, entry_type.EntryType.TERM) self.assertEqual(errors[0].line, 1) self.assertEqual(errors[0].column, 1) self.assertEqual( diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/test_utils/mocks.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/test_utils/mocks.py index a5b9840..ac1e367 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/test_utils/mocks.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/test_utils/mocks.py @@ -63,11 +63,32 @@ def mocked_get_api_response(url, headers=None, json=None): # pylint: disable=un r".+/v2/projects/123/locations/us/entryGroups /test_entry_group_with_terms/entries/glossary_exists Glossary entry exists and there are terms associated to it. + 9. 
ListEntries:
+      r".+/v2/projects/.+/locations/.+/entryGroups
+      /test_entry_group_with_categories_and_terms/entries?view=FULL$"
+      Glossary entry exists and there are categories and terms associated to
+      it.
+  10. GetEntry:
+      r".+/v2/projects/123/locations/us/entryGroups
+      /test_entry_group_with_categories_and_terms/entries/glossary_not_empty
+      Glossary entry exists and there are categories and terms associated to
+      it.
+  11. ListEntries:
+      r".+/v2/projects/.+/locations/.+/entryGroups
+      /test_entry_group_with_categories/entries?view=FULL$"
+      Glossary entry exists and there are categories associated to it.
+  12. GetEntry:
+      r".+/v2/projects/123/locations/us/entryGroups
+      /test_entry_group_with_categories/entries/glossary_not_empty
+      Glossary entry exists and there are categories associated to it.

   Args:
     url: str
     headers: Dict
     json: Dict
+
   Returns:
     MockResponse(dict, status_code)
   """
@@ -79,7 +100,7 @@ def mocked_get_api_response(url, headers=None, json=None):  # pylint: disable=un
     return MockResponse({"error": error_response}, 400)
   elif url == (
       "https://datacatalog.googleapis.com/v2/get_call/error_with_exception"
-  ):
+  ):
     return MockThrowingJSONDecodeError("", 200)
   elif re.fullmatch(
       (
@@ -87,24 +108,24 @@ def mocked_get_api_response(url, headers=None, json=None):  # pylint: disable=un
           "entryGroups/test_entry_group_with_no_glossary/"
           "entries/glossary_not_found"
       ),
-      url):
-    return MockResponse({
-        "error": {
-            "code": 404,
-            "message": "Requested entity was not found.",
-            "status": "NOT_FOUND",
-            "details": [
-                {
+      url,
+  ):
+    return MockResponse(
+        {
+            "error": {
+                "code": 404,
+                "message": "Requested entity was not found.",
+                "status": "NOT_FOUND",
+                "details": [{
                     "@type": "type.googleapis.com/google.rpc.ErrorInfo",
                     "reason": "notFound",
                     "domain": "datacatalog.googleapis.com",
-                    "metadata": {
-                        "code": "ENTRY_NOT_FOUND"
-                    }
-                }
-            ]
-        }
-    }, 404)
+                    "metadata": {"code": "ENTRY_NOT_FOUND"},
+                }],
+            }
+        },
+        404,
+    )

   # List entry group with no business terms, but glossary entry existing
   elif re.fullmatch(
@@ -112,10 +133,123 @@ def mocked_get_api_response(url, headers=None, json=None):  # pylint: disable=un
       ".+/v2/projects/.+/locations/.+/"
       r"entryGroups/test_entry_group_with_no_terms/entries\?view=FULL$"
       ),
-      url):
-    return MockResponse({
-        "entries":
-            [
+      url,
+  ):
+    return MockResponse(
+        {
+            "entries": [{
+                "name": "dc_glossary_test",
+                "displayName": "Glossary 2",
+                "entryType": "glossary",
+                "coreAspects": {
+                    "business_context": {
+                        "name": (
+                            "projects/123/locations/us/"
+                            "entryGroups/test_entry_group_with_no_terms/"
+                            "entries/empty_glossary_exists/"
+                            "aspects/3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e"
+                        ),
+                        "aspectType": "business_context",
+                        "jsonContent": {
+                            "description": (
+                                "\u003cp\u003eEmpty glossary\u003c/p\u003e"
+                            )
+                        },
+                        "createTime": "2023-05-11T17:18:00.838415Z",
+                        "modifyTime": "2023-05-12T08:32:21.859231Z",
+                    }
+                },
+                "createTime": "2023-04-26T07:30:05.022015Z",
+                "modifyTime": "2023-04-26T07:30:05.022015Z",
+                "entryUid": "71372af7-bb1a-4020-aba8-223c57c366d2",
+            }]
+        },
+        200,
+    )
+  # Get glossary entry
+  elif re.fullmatch(
+      (
+          ".+/v2/projects/.+/locations/.+/"
+          "entryGroups/test_entry_group_with_no_terms/"
+          "entries/empty_glossary_exists"
+      ),
+      url,
+  ):
+    return MockResponse(
+        {
+            "name": "dc_glossary_test",
+            "displayName": "Glossary 2",
+            "entryType": "glossary",
+            "coreAspects": {
+                "business_context": {
+                    "name": (
+                        "projects/123/locations/us/"
+                        "entryGroups/test_entry_group_with_no_terms/"
+                        "entries/empty_glossary_exists/"
+
"aspects/3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" + ), + "aspectType": "business_context", + "jsonContent": { + "description": ( + "\u003cp\u003eEmpty glossary\u003c/p\u003e" + ) + }, + "createTime": "2023-05-11T17:18:00.838415Z", + "modifyTime": "2023-05-12T08:32:21.859231Z", + } + }, + "createTime": "2023-04-26T07:30:05.022015Z", + "modifyTime": "2023-04-26T07:30:05.022015Z", + "entryUid": "71372af7-bb1a-4020-aba8-223c57c366d2", + }, + 200, + ) + + elif re.fullmatch( + ( + ".+/v2/projects/.+/locations/.+/" + r"entryGroups/test_entry_group_with_terms/entries\?view=FULL$" + ), + url, + ): + return MockResponse( + { + "entries": [ + { + "name": "purchase_numberxswfrh", + "displayName": "Purchase number", + "entryType": "glossary_term", + "createTime": "2023-04-26T07:30:05.022015Z", + "modifyTime": "2023-04-26T07:30:05.022015Z", + "entryUid": "c1df00c7-2e2f-4d47-9d1c-d16a9dfbb7a9", + "coreAspects": { + "business_context": { + "name": ( + "projects/123/locations/us/" + "entryGroups/test_entry_group_with_terms/" + "entries/purchase_numberxswfrh/" + "aspects/" + "3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" + ), + "aspectType": "business_context", + "jsonContent": { + "description": ( + "\u003cp\u003ePurchase number" + "description\u003c/p\u003e" + ) + }, + "createTime": "2023-05-11T17:18:00.838415Z", + "modifyTime": "2023-05-12T08:32:21.859231Z", + } + }, + "coreRelationships": [{ + "name": "projects/2e64c8013eb2c67a0ee2e", + "relationshipType": "is_child_of", + "destinationEntryName": ( + "entries/71372af7-bb1a-4020-aba8-223c57c366d2" + ), + }], + }, { "name": "dc_glossary_test", "displayName": "Glossary 2", @@ -124,169 +258,324 @@ def mocked_get_api_response(url, headers=None, json=None): # pylint: disable=un "business_context": { "name": ( "projects/123/locations/us/" - "entryGroups/test_entry_group_with_no_terms/" - "entries/empty_glossary_exists/" - "aspects/3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" + "entryGroups/test_entry_group_with_terms/" + "entries/glossary_exists/" + "aspects/" + "3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" ), "aspectType": "business_context", "jsonContent": { "description": ( - "\u003cp\u003eEmpty glossary\u003c/p\u003e" + "\u003cp\u003eGlossary with" + "terms.\u003c/p\u003e" ) }, "createTime": "2023-05-11T17:18:00.838415Z", - "modifyTime": "2023-05-12T08:32:21.859231Z" + "modifyTime": "2023-05-12T08:32:21.859231Z", } }, "createTime": "2023-04-26T07:30:05.022015Z", "modifyTime": "2023-04-26T07:30:05.022015Z", "entryUid": "71372af7-bb1a-4020-aba8-223c57c366d2", - } + }, ] - }, 200) - # Get glossary entry + }, + 200, + ) + elif re.fullmatch( + ( + ".+/v2/projects/123/locations/us/" + "entryGroups/test_entry_group_with_terms/entries/glossary_exists" + ), + url, + ): + return MockResponse( + { + "name": "dc_glossary_test", + "displayName": "Glossary 2", + "entryType": "glossary", + "coreAspects": { + "business_context": { + "name": ( + "projects/123/locations/us/" + "entryGroups/test_entry_group_with_terms/" + "entries/glossary_exists/" + "aspects/3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" + ), + "aspectType": "business_context", + "jsonContent": { + "description": ( + "\u003cp\u003eGlossary with terms.\u003c/p\u003e" + ) + }, + "createTime": "2023-05-11T17:18:00.838415Z", + "modifyTime": "2023-05-12T08:32:21.859231Z", + } + }, + "createTime": "2023-04-26T07:30:05.022015Z", + "modifyTime": "2023-04-26T07:30:05.022015Z", + "entryUid": "71372af7-bb1a-4020-aba8-223c57c366d2", + }, + 200, + ) elif re.fullmatch( ( ".+/v2/projects/.+/locations/.+/" - "entryGroups/test_entry_group_with_no_terms/" - 
"entries/empty_glossary_exists" + r"entryGroups/test_entry_group_with_categories_and_terms/entries\?view=FULL$" ), - url): - return MockResponse({ - "name": "dc_glossary_test", - "displayName": "Glossary 2", - "entryType": "glossary", - "coreAspects": { - "business_context": { - "name": ( - "projects/123/locations/us/" - "entryGroups/test_entry_group_with_no_terms/" - "entries/empty_glossary_exists/" - "aspects/3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" - ), - "aspectType": "business_context", - "jsonContent": { - "description": "\u003cp\u003eEmpty glossary\u003c/p\u003e" + url, + ): + return MockResponse( + { + "entries": [ + { + "name": "PII_data", + "displayName": "PII data", + "entryType": "glossary_category", + "coreAspects": { + "business_context": { + "name": ( + "projects/123/locations/us/entryGroups/" + "test_entry_group_with_categories_and_terms/" + "entries/PII_data/aspects/" + "3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" + ), + "aspectType": "business_context", + "jsonContent": { + "description": ( + "\u003cp\u003ePII data" + " description\u003c/p\u003e" + ) + }, + "createTime": "2023-08-29T07:36:45.771375Z", + "modifyTime": "2023-08-29T07:36:45.771375Z", + } + }, + "createTime": "2023-08-29T07:36:45.771375Z", + "modifyTime": "2023-08-29T07:36:45.771375Z", + "entryUid": "2352bc5f-2fae-4350-86be-1708f1b04b27", + "coreRelationships": [{ + "name": "projects/3486de3ed96e322999e2c3b66ab0eb94", + "relationshipType": "is_child_of", + "destinationEntryName": ( + "entries/8f130395-ca37-4e99-b0cc-0ac975e66607" + ), + }], }, - "createTime": "2023-05-11T17:18:00.838415Z", - "modifyTime": "2023-05-12T08:32:21.859231Z" - } + { + "name": "FirstName", + "displayName": "First Name", + "entryType": "glossary_term", + "coreAspects": { + "business_context": { + "name": ( + "projects/123/locations/us/entryGroups/" + "test_entry_group_with_categories_and_terms/" + "entries/FirstName/aspects/" + "3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" + ), + "aspectType": "business_context", + "jsonContent": { + "description": ( + "\u003cp\u003eFirst name" + " description\u003c/p\u003e" + ) + }, + "createTime": "2023-08-29T07:36:44.459135Z", + "modifyTime": "2023-08-29T07:36:44.459135Z", + } + }, + "createTime": "2023-08-29T07:36:44.459135Z", + "modifyTime": "2023-08-29T07:36:44.459135Z", + "entryUid": "c57bd02e-f55b-4226-b8af-2061a28f4cee", + "coreRelationships": [{ + "name": "projects/4c7c287071346e4f096d552172e28a08", + "relationshipType": "is_child_of", + "destinationEntryName": ( + "entries/8f130395-ca37-4e99-b0cc-0ac975e66607" + ), + }], + }, + { + "name": "glossary_not_empty", + "displayName": "glossary_not_empty", + "entryType": "glossary", + "coreAspects": { + "business_context": { + "name": ( + "projects/123/locations/us/entryGroups/" + "test_entry_group_with_categories_and_terms/" + "entries/glossary_not_empty/aspects/" + "3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" + ), + "aspectType": "business_context", + "jsonContent": { + "description": ( + "\u003cp\u003eGlossary with categories and" + "terms.\u003c/p\u003e" + ) + }, + "createTime": "2023-08-29T07:36:44.155839Z", + "modifyTime": "2023-08-29T07:36:44.155839Z", + } + }, + "createTime": "2023-08-29T07:36:44.155839Z", + "modifyTime": "2023-08-29T08:54:29.506959Z", + "entryUid": "8f130395-ca37-4e99-b0cc-0ac975e66607", + }, + ] }, - "createTime": "2023-04-26T07:30:05.022015Z", - "modifyTime": "2023-04-26T07:30:05.022015Z", - "entryUid": "71372af7-bb1a-4020-aba8-223c57c366d2", - }, 200) - + 200, + ) + elif re.fullmatch( + ( + ".+/v2/projects/123/locations/us/" + 
"entryGroups/test_entry_group_with_categories_and_terms/entries/glossary_not_empty" + ), + url, + ): + return MockResponse( + { + "name": "glossary_not_empty", + "displayName": "glossary_not_empty", + "entryType": "glossary", + "coreAspects": { + "business_context": { + "name": ( + "projects/123/locations/us/entryGroups/" + "test_entry_group_with_categories_and_terms/entries/" + "glossary_not_empty/aspects/" + "3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" + ), + "aspectType": "business_context", + "jsonContent": { + "description": ( + "\u003cp\u003eGlossary with categories and" + "terms.\u003c/p\u003e" + ) + }, + "createTime": "2023-08-29T07:36:44.155839Z", + "modifyTime": "2023-08-29T07:36:44.155839Z", + } + }, + "createTime": "2023-08-29T07:36:44.155839Z", + "modifyTime": "2023-08-29T08:54:29.506959Z", + "entryUid": "8f130395-ca37-4e99-b0cc-0ac975e66607", + }, + 200, + ) elif re.fullmatch( ( ".+/v2/projects/.+/locations/.+/" - r"entryGroups/test_entry_group_with_terms/entries\?view=FULL$" + r"entryGroups/test_entry_group_with_categories/entries\?view=FULL$" ), - url): + url, + ): return MockResponse( { - "entries": - [ - { - "name": "purchase_numberxswfrh", - "displayName": "Purchase number", - "entryType": "glossary_term", - "createTime": "2023-04-26T07:30:05.022015Z", - "modifyTime": "2023-04-26T07:30:05.022015Z", - "entryUid": "c1df00c7-2e2f-4d47-9d1c-d16a9dfbb7a9", - "coreAspects": { - "business_context": { - "name": ( - "projects/123/locations/us/" - "entryGroups/test_entry_group_with_terms/" - "entries/purchase_numberxswfrh/" - "aspects/" - "3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" - ), - "aspectType": "business_context", - "jsonContent": { - "description": ( - "\u003cp\u003ePurchase number" - "description\u003c/p\u003e" - ) - }, - "createTime": "2023-05-11T17:18:00.838415Z", - "modifyTime": "2023-05-12T08:32:21.859231Z" - } - }, - "coreRelationships": [ - { - "name": "projects/2e64c8013eb2c67a0ee2e", - "relationshipType": "is_child_of", - "destinationEntryName": ( - "entries/" - "71372af7-bb1a-4020-aba8-223c57c366d2" + "entries": [ + { + "name": "PII_data", + "displayName": "PII data", + "entryType": "glossary_category", + "coreAspects": { + "business_context": { + "name": ( + "projects/123/locations/us/entryGroups/" + "test_entry_group_with_categories/" + "entries/PII_data/aspects/" + "3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" + ), + "aspectType": "business_context", + "jsonContent": { + "description": ( + "\u003cp\u003ePII data" + " description\u003c/p\u003e" ) - } - ] + }, + "createTime": "2023-08-29T07:36:45.771375Z", + "modifyTime": "2023-08-29T07:36:45.771375Z", + } }, - { - "name": "dc_glossary_test", - "displayName": "Glossary 2", - "entryType": "glossary", - "coreAspects": { - "business_context": { - "name": ( - "projects/123/locations/us/" - "entryGroups/test_entry_group_with_terms/" - "entries/glossary_exists/" - "aspects/" - "3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" - ), - "aspectType": "business_context", - "jsonContent": { - "description": ( - "\u003cp\u003eGlossary with" - "terms.\u003c/p\u003e" - ) - }, - "createTime": "2023-05-11T17:18:00.838415Z", - "modifyTime": "2023-05-12T08:32:21.859231Z" - } - }, - "createTime": "2023-04-26T07:30:05.022015Z", - "modifyTime": "2023-04-26T07:30:05.022015Z", - "entryUid": "71372af7-bb1a-4020-aba8-223c57c366d2", - } - ] - }, 200) + "createTime": "2023-08-29T07:36:45.771375Z", + "modifyTime": "2023-08-29T07:36:45.771375Z", + "entryUid": "2352bc5f-2fae-4350-86be-1708f1b04b27", + "coreRelationships": [{ + "name": 
"projects/3486de3ed96e322999e2c3b66ab0eb94", + "relationshipType": "is_child_of", + "destinationEntryName": ( + "entries/8f130395-ca37-4e99-b0cc-0ac975e66607" + ), + }], + }, + { + "name": "glossary_not_empty", + "displayName": "glossary_not_empty", + "entryType": "glossary", + "coreAspects": { + "business_context": { + "name": ( + "projects/123/locations/us/entryGroups/" + "test_entry_group_with_categories/" + "entries/glossary_not_empty/aspects/" + "3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" + ), + "aspectType": "business_context", + "jsonContent": { + "description": ( + "\u003cp\u003eGlossary with categories and" + "terms.\u003c/p\u003e" + ) + }, + "createTime": "2023-08-29T07:36:44.155839Z", + "modifyTime": "2023-08-29T07:36:44.155839Z", + } + }, + "createTime": "2023-08-29T07:36:44.155839Z", + "modifyTime": "2023-08-29T08:54:29.506959Z", + "entryUid": "8f130395-ca37-4e99-b0cc-0ac975e66607", + }, + ] + }, + 200, + ) elif re.fullmatch( ( ".+/v2/projects/123/locations/us/" - "entryGroups/test_entry_group_with_terms/entries/glossary_exists" + "entryGroups/test_entry_group_with_categories/entries/glossary_not_empty" ), - url): - return MockResponse({ - "name": "dc_glossary_test", - "displayName": "Glossary 2", - "entryType": "glossary", - "coreAspects": { - "business_context": { - "name": ( - "projects/123/locations/us/" - "entryGroups/test_entry_group_with_terms/" - "entries/glossary_exists/" - "aspects/3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" - ), - "aspectType": "business_context", - "jsonContent": { - "description": ( - "\u003cp\u003eGlossary with terms.\u003c/p\u003e" - ) - }, - "createTime": "2023-05-11T17:18:00.838415Z", - "modifyTime": "2023-05-12T08:32:21.859231Z" - } + url, + ): + return MockResponse( + { + "name": "glossary_not_empty", + "displayName": "glossary_not_empty", + "entryType": "glossary", + "coreAspects": { + "business_context": { + "name": ( + "projects/123/locations/us/entryGroups/" + "test_entry_group_with_categories/entries/" + "glossary_not_empty/aspects/" + "3f6ee7a1-07d3-4d2b-a76a-7f4d06aaa34e" + ), + "aspectType": "business_context", + "jsonContent": { + "description": ( + "\u003cp\u003eGlossary with" + "categories\u003c/p\u003e" + ) + }, + "createTime": "2023-08-29T07:36:44.155839Z", + "modifyTime": "2023-08-29T07:36:44.155839Z", + } + }, + "createTime": "2023-08-29T07:36:44.155839Z", + "modifyTime": "2023-08-29T08:54:29.506959Z", + "entryUid": "8f130395-ca37-4e99-b0cc-0ac975e66607", }, - "createTime": "2023-04-26T07:30:05.022015Z", - "modifyTime": "2023-04-26T07:30:05.022015Z", - "entryUid": "71372af7-bb1a-4020-aba8-223c57c366d2", - }, 200) + 200, + ) def mocked_post_failed_api_response(url, headers=None, json=None): # pylint: disable=unused-argument diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/user_report.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/user_report.py index 78acbc5..b627df2 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/user_report.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/user_report.py @@ -1,23 +1,48 @@ """User report generation.""" +import category as bg_category +import entry_type as entry_type_lib import error -import glossary import logging_utils +import relation_type import term as bg_term +logger = logging_utils.get_logger() +entries_name_type_pairs = [ + ("terms", entry_type_lib.EntryType.TERM), + ("categories", 
entry_type_lib.EntryType.CATEGORY),
+]

-def print_report(csv_terms: int,
-                 terms_created: list[bg_term.Term],
-                 relations_created: list[
-                     tuple[str, str, glossary.RelationshipType]
-                 ],
-                 errors: list[error.TermImportError]) -> None:
-  """Implements the User Report generated after Term import.
+
+def print_report_for_erronous_categories_import(
+    imported_categories: list[bg_category.Category],
+    categories_import_errors: list[error.EntryImportError],
+) -> None:
+  """Prints categories import errors and any successfully imported categories."""
+  logger.info("Categories import errors:")
+  for err in categories_import_errors:
+    logger.error(err.to_string())
+  _print_imported_categories(imported_categories)
+
+
+def print_report(
+    lines_read: dict[entry_type_lib.EntryType, int],
+    imported_entries: dict[
+        entry_type_lib.EntryType, list[bg_term.Term | bg_category.Category]
+    ],
+    imported_relations: dict[
+        entry_type_lib.EntryType,
+        list[tuple[str, str, relation_type.RelationshipType]],
+    ],
+    import_errors: list[error.EntryImportError],
+) -> None:
+  """Implements the User Report generated after Term and Categories import.

   The user report contains information on errors that happened during import
   as well as aggregated data.

   Error messages contain:
   - The line and column in the CSV input file that generates the issue.
-  - The display name of the term(s) and/or data assets involved in the error.
+  - The display name of the entries and/or data assets involved in the error.
   - The operation that generated the issue.
   - A useful error message explaining the issue in detail.

@@ -25,34 +50,108 @@ def print_report(csv_terms: int,
   the user:
   - Successful term creations in the format
     (successful_term_imports / total_term_imports)
+  - Successful category creations in the format
+    (successful_category_imports / total_category_imports)

   Args:
-    csv_terms: int
-      Number of terms in the original CSV.
-    terms_created: list[Term]
-      List of all terms that were successfully created.
-    relations_created: list[tuple[str, str, RelationshipType]]
-      List of all relations successfully created represented as the
-      source and destination assets and the relationship type.
-    errors: list[TermImportError]
-      List of all error instances that happened during term import.
+    lines_read: dict[entry_type_lib.EntryType, int] A dictionary mapping entry
+      type (Term, Category) to number of entries in the original CSV file.
+    imported_entries: dict[entry_type_lib.EntryType, list[bg_term.Term |
+      bg_category.Category]] A dictionary mapping entry type to list of all
+      entries (Terms or Categories) that were successfully created.
+    imported_relations: A dictionary mapping entry type to list[tuple[str, str,
+      RelationshipType]] - list of all relations successfully created
+      represented as the source and destination assets and the relationship
+      type.
+    import_errors: list[error.EntryImportError] List of all error instances
+      that happened during the import.
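+
+  Example (hypothetical values; `created_terms` stands in for the list of
+  Term objects built during the import step) for a terms-only run:
+
+    print_report(
+        lines_read={entry_type_lib.EntryType.TERM: 10},
+        imported_entries={entry_type_lib.EntryType.TERM: created_terms},
+        imported_relations={entry_type_lib.EntryType.TERM: []},
+        import_errors=[],
+    )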
""" - logger = logging_utils.get_logger() - if errors: - logger.info("Import error report:") - for err in errors: + _print_import_errors(import_errors) + _print_imported_entries(imported_entries) + _print_imported_relations(imported_relations) + _print_statistics(lines_read, imported_entries) + + +def _print_import_errors(import_errors: list[error.EntryImportError]) -> None: + if import_errors: + logger.info("Import errors report:") + for err in import_errors: logger.error(err.to_string()) - if terms_created: - logger.info("Terms successfully imported:") - for term in sorted(terms_created, key=lambda t: t.display_name): - logger.info(f"\t- {term.display_name}") - if relations_created: - logger.info("Relationships successfully imported:") - for src, dst, relation_type in relations_created: - logger.info(f"\t- {src} {relation_type.value} {dst}") +def _print_imported_entries( + imported_entries: dict[ + entry_type_lib.EntryType, list[bg_term.Term | bg_category.Category] + ] +) -> None: + """Prints the imported entries (terms and categories). + + Args: + imported_entries: dict[entry_type_lib.EntryType, list[bg_term.Term | + bg_category.Category]] A dictionary mapping entry type to list of all + entries (Terms or Categories) that were successfully created. + """ + for entries_name, entry_type in entries_name_type_pairs: + if entry_type in imported_entries: + logger.info(f"{entries_name.capitalize()} successfully imported:") + for entry in sorted( + imported_entries[entry_type], key=lambda t: t.display_name + ): + logger.info(f'\t- "{entry.display_name}"') + - logger.info("Statistics of Imported terms:") - logger.info(f"Terms successfully created: {len(terms_created)}/" - f"{csv_terms}.") +def _print_imported_categories( + imported_categories: list[bg_category.Category], +) -> None: + logger.info("Categories successfully imported:") + for entry in sorted(imported_categories, key=lambda t: t.display_name): + logger.info(f'\t- "{entry.display_name}"') + + +def _print_imported_relations( + imported_relations: dict[ + entry_type_lib.EntryType, + list[tuple[str, str, relation_type.RelationshipType]], + ] +) -> None: + """Prints the imported relationships. + + Args: + imported_relations: A dictionary mapping entry type to list[tuple[str, str, + RelationshipType]] - list of all relations successfully created + represented as the source and destination assets and the relationship + type. + """ + for entries_name, entry_type in entries_name_type_pairs: + if entry_type in imported_relations and imported_relations[entry_type]: + logger.info( + f"Relationships successfully imported from {entries_name} csv:" + ) + for src, dst, relationship_type in imported_relations[entry_type]: + logger.info(f'\t- "{src}" {relationship_type.value} "{dst}"') + + +def _print_statistics( + lines_read: dict[entry_type_lib.EntryType, int], + imported_entries: dict[ + entry_type_lib.EntryType, list[bg_term.Term | bg_category.Category] + ], +) -> None: + """Prints statistics about imported entries. + + Args: + lines_read: dict[entry_type_lib.EntryType, int] A dictionary mapping entry + type (Term, Category) to number of entries in the original CSV file. + imported_entries: dict[entry_type_lib.EntryType, list[bg_term.Term | + bg_category.Category]] A dictionary mapping entry type to list of all + entries (Terms or Categories) that were successfully created. 
+ """ + for entries_name, entry_type in entries_name_type_pairs: + if entry_type in lines_read and entry_type in imported_entries: + imported_entries_count = len(imported_entries[entry_type]) + parsed_entries_count = lines_read[entry_type] + logger.info(f"Statistics of Imported {entries_name}:") + logger.info( + f"{entries_name.capitalize()} successfully created:" + f" {imported_entries_count}/{parsed_entries_count}." + ) diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/utils.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/utils.py index 36705bf..bbe2478 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/utils.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/utils.py @@ -6,6 +6,7 @@ import sys import error +import import_mode as import_mode_lib import logging_utils @@ -41,6 +42,16 @@ def get_arguments() -> argparse.Namespace: return parser.parse_args() +def get_import_mode(args: argparse.Namespace) -> import_mode_lib.ImportMode: + modes = [mode.value for mode in import_mode_lib.ImportMode] + import_mode = vars(args).get("import_mode") + + if import_mode and import_mode.lower() in modes: + return import_mode_lib.ImportMode(import_mode.lower()) + + return import_mode_lib.ImportMode.STRICT + + def end_program_execution() -> None: logger.warning("Program execution finished ahead of time due to errors.") sys.exit(1) @@ -53,9 +64,10 @@ def configure_argument_parser(parser: argparse.ArgumentParser) -> None: parser: argparse.ArgumentParser(). """ parser.add_argument( - "csv", - help="Path to the CSV file containing the data to import.", - metavar="[CSV file]", + "terms_csv_legacy", + help="Path to the CSV file containing the terms data to import.", + metavar="[Terms CSV file (legacy)]", + nargs="?", type=str, ) parser.add_argument( @@ -92,6 +104,18 @@ def configure_argument_parser(parser: argparse.ArgumentParser) -> None: type=str, required=True, ) + parser.add_argument( + "--categories-csv", + help="Path to the CSV file containing the categories data to import.", + metavar="[Categories CSV file]", + type=str, + ) + parser.add_argument( + "--terms-csv", + help="Path to the CSV file containing the terms data to import.", + metavar="[Terms CSV file]", + type=str, + ) parser.add_argument( "--import-mode", choices=["strict", "clear"], @@ -99,7 +123,7 @@ def configure_argument_parser(parser: argparse.ArgumentParser) -> None: type=str, help=( "Sets level of permissiviness with which the data is imported into" - " Data Catalog. The default value is \"strict\".:\n" + ' Data Catalog. The default value is "strict".:\n' "strict\tCheck if the target glossary does not contain any entries," " and if it does, stops executing the program.\n" "clear\tRemove all the pre-existing entries in the target glossary" @@ -112,10 +136,61 @@ def configure_argument_parser(parser: argparse.ArgumentParser) -> None: "If set, the program will finish its execution if there are any" " parsing errors without importing any terms." ), - action="store_true" + action="store_true", ) def display_parsing_errors(errors: list[error.ParseError]) -> None: for err in errors: logger.error(err.to_string()) + + +def validate_args(args: argparse.Namespace) -> None: + """Validates script run arguments. 
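+
+  A sketch of the checks below (all names as defined in this module): the
+  GCLOUD_ACCESS_TOKEN environment variable must be set, at least one CSV
+  path must be provided, --terms-csv and the legacy positional terms CSV
+  argument are mutually exclusive, and every provided path must point to
+  an existing file. Each violation is logged and ends execution.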
+
+  Args:
+    args: script run arguments
+  """
+
+  # Verify access token is available
+  if not access_token_exists():
+    logger.error("Environment variable GCLOUD_ACCESS_TOKEN doesn't exist.")
+    sys.exit(1)
+
+  # Verify that at least one CSV parameter is provided
+  if (
+      not args.terms_csv_legacy
+      and not args.categories_csv
+      and not args.terms_csv
+  ):
+    logger.error("At least one CSV file path parameter must be provided.")
+    sys.exit(1)
+
+  # Verify that only one terms CSV is provided
+  if args.terms_csv and args.terms_csv_legacy:
+    logger.error(
+        "At most one of --terms-csv and the legacy positional terms CSV"
+        " argument should be provided."
+    )
+    sys.exit(1)
+
+  _verify_csv_file_existence(args, "terms_csv_legacy")
+  _verify_csv_file_existence(args, "terms_csv", prefix="--")
+  _verify_csv_file_existence(args, "categories_csv", prefix="--")
+
+
+def _verify_csv_file_existence(
+    args: argparse.Namespace, arg_name: str, prefix: str = ""
+):
+  """Logs an error if the provided CSV file path doesn't exist.
+
+  Args:
+    args: script run arguments
+    arg_name: CSV path argument
+    prefix: argument prefix, e.g. for --terms-csv, prefix="--"
+  """
+  file_path = vars(args).get(arg_name)
+  if file_path and not csv_file_exists(file_path):
+    logger.error(
+        f"The provided {prefix}{arg_name} CSV file path doesn't exist."
+    )
+    sys.exit(1)
From 8f481d1ea987c60f8e44a3e6794f3d72401dbaa3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20=C5=81awski?=
Date: Thu, 28 Sep 2023 10:21:59 +0000
Subject: [PATCH 2/4] Add glossary categories to the chicago crimes glossary
 dataset

---
 .../chicago_crimes_glossary_categories.csv    |  5 +++
 ....csv => chicago_crimes_glossary_terms.csv} | 44 +++++++++----------
 2 files changed, 27 insertions(+), 22 deletions(-)
 create mode 100644 dataplex-quickstart-labs/00-resources/datasets/chicago-crimes/business_glossary/chicago_crimes_glossary_categories.csv
 rename dataplex-quickstart-labs/00-resources/datasets/chicago-crimes/business_glossary/{chicago_crimes_glossary.csv => chicago_crimes_glossary_terms.csv} (67%)

diff --git a/dataplex-quickstart-labs/00-resources/datasets/chicago-crimes/business_glossary/chicago_crimes_glossary_categories.csv b/dataplex-quickstart-labs/00-resources/datasets/chicago-crimes/business_glossary/chicago_crimes_glossary_categories.csv
new file mode 100644
index 0000000..25e46f1
--- /dev/null
+++ b/dataplex-quickstart-labs/00-resources/datasets/chicago-crimes/business_glossary/chicago_crimes_glossary_categories.csv
@@ -0,0 +1,5 @@
+"Crime Metadata","Aggregates crime-related categories such as Location Data, Calendar Data, IUCR Data and FBI Data; also aggregates some generic crime terms, e.g. case number.","Don Joe, John Doe",,
+"Location Data","Aggregates terms related to crime location such as coordinates, address, ward, district, beat and location description.","Don Joe, John Doe","Crime Metadata",
+"Calendar Data","Aggregates terms related to crime date such as year or exact date.","Don Joe, John Doe","Crime Metadata",
+"IUCR Data","Aggregates terms related to Illinois Uniform Crime Reporting such as IUCR, primary type or description.","Don Joe, John Doe","Crime Metadata",
+"FBI Data","Aggregates terms related to FBI - e.g.
FBI Code","Don Joe, John Doe","Crime Metadata",
diff --git a/dataplex-quickstart-labs/00-resources/datasets/chicago-crimes/business_glossary/chicago_crimes_glossary.csv b/dataplex-quickstart-labs/00-resources/datasets/chicago-crimes/business_glossary/chicago_crimes_glossary_terms.csv
similarity index 67%
rename from dataplex-quickstart-labs/00-resources/datasets/chicago-crimes/business_glossary/chicago_crimes_glossary.csv
rename to dataplex-quickstart-labs/00-resources/datasets/chicago-crimes/business_glossary/chicago_crimes_glossary_terms.csv
index 5e901ee..0f1a2c9 100644
--- a/dataplex-quickstart-labs/00-resources/datasets/chicago-crimes/business_glossary/chicago_crimes_glossary.csv
+++ b/dataplex-quickstart-labs/00-resources/datasets/chicago-crimes/business_glossary/chicago_crimes_glossary_terms.csv
@@ -1,22 +1,22 @@
-"ID", "Unique identifier for the record.",,,,
-"Case Number", "The Chicago Police Department RD Number (Records Division Number), which is unique to the incident.",,,,
-"Date", "Date when the incident occurred. this is sometimes a best estimate.",,,,
-"Block", "The partially redacted address where the incident occurred, placing it on the same block as the actual address.",,,,
-"IUCR", "The Illinois Unifrom Crime Reporting code. This is directly linked to the Primary Type and Description. See the list of IUCR codes at https://data.cityofchicago.org/d/c7ck-438e.",,,,
-"Primary Type", "The primary description of the IUCR code.",,,,"IUCR"
-"Description", "The secondary description of the IUCR code, a subcategory of the primary description.",,,,"IUCR"
-"Location Description", "Description of the location where the incident occurred.",,,,
-"Arrest", "Indicates whether an arrest was made.",,,,
-"Domestic", "Indicates whether the incident was domestic-related as defined by the Illinois Domestic Violence Act.",,,,
-"Beat", "Indicates the beat where the incident occurred. A beat is the smallest police geographic area – each beat has a dedicated police beat car. Three to five beats make up a police sector, and three sectors make up a police district. The Chicago Police Department has 22 police districts. See the beats at https://data.cityofchicago.org/d/aerh-rz74.",,,,
-"District", "Indicates the police district where the incident occurred. See the districts at https://data.cityofchicago.org/d/fthy-xz3r.",,,,"Beat"
-"Ward", "The ward (City Council district) where the incident occurred. See the wards at https://data.cityofchicago.org/d/sp34-6z76.",,,,"Beat, District"
-"Community Area", "Indicates the community area where the incident occurred. Chicago has 77 community areas. See the community areas at https://data.cityofchicago.org/d/cauq-8yn6.",,,,"Beat, District, Ward"
-"FBI Code", "Indicates the crime classification as outlined in the FBI's National Incident-Based Reporting System (NIBRS). See the Chicago Police Department listing of these classifications at http://gis.chicagopolice.org/clearmap_crime_sums/crime_types.",,,,
-"X Coordinate", "The x coordinate of the location where the incident occurred in State Plane Illinois East NAD 1983 projection. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,
-"Y Coordinate", "The y coordinate of the location where the incident occurred in State Plane Illinois East NAD 1983 projection.
This location is shifted from the actual location for partial redaction but falls on the same block.",,,,"X Coordinate"
-"Year", "Year the incident occurred.",,,,
-"Updated On", "Date and time the record was last updated.",,,,
-"Latitude", "The latitude of the location where the incident occurred. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,
-"Longitude", "The longitude of the location where the incident occurred. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,"Latitude"
-"Location", "The location where the incident occurred in a format that allows for creation of maps and other geographic operations on this data portal. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,
+"ID", "Unique identifier for the record.",,,,,"Crime Metadata",
+"Case Number", "The Chicago Police Department RD Number (Records Division Number), which is unique to the incident.",,,,,"Crime Metadata",
+"Date", "Date when the incident occurred. This is sometimes a best estimate.",,,,,"Calendar Data",
+"Block", "The partially redacted address where the incident occurred, placing it on the same block as the actual address.",,,,,"Location Data",
+"IUCR", "The Illinois Uniform Crime Reporting code. This is directly linked to the Primary Type and Description. See the list of IUCR codes at https://data.cityofchicago.org/d/c7ck-438e.",,,,,"IUCR Data",
+"Primary Type", "The primary description of the IUCR code.",,,,"IUCR","IUCR Data",
+"Description", "The secondary description of the IUCR code, a subcategory of the primary description.",,,,"IUCR","IUCR Data",
+"Location Description", "Description of the location where the incident occurred.",,,,,"Location Data",
+"Arrest", "Indicates whether an arrest was made.",,,,,"Crime Metadata",
+"Domestic", "Indicates whether the incident was domestic-related as defined by the Illinois Domestic Violence Act.",,,,,"Crime Metadata",
+"Beat", "Indicates the beat where the incident occurred. A beat is the smallest police geographic area – each beat has a dedicated police beat car. Three to five beats make up a police sector, and three sectors make up a police district. The Chicago Police Department has 22 police districts. See the beats at https://data.cityofchicago.org/d/aerh-rz74.",,,,,"Location Data",
+"District", "Indicates the police district where the incident occurred. See the districts at https://data.cityofchicago.org/d/fthy-xz3r.",,,,"Beat","Location Data",
+"Ward", "The ward (City Council district) where the incident occurred. See the wards at https://data.cityofchicago.org/d/sp34-6z76.",,,,"Beat, District","Location Data",
+"Community Area", "Indicates the community area where the incident occurred. Chicago has 77 community areas. See the community areas at https://data.cityofchicago.org/d/cauq-8yn6.",,,,"Beat, District, Ward","Location Data",
+"FBI Code", "Indicates the crime classification as outlined in the FBI's National Incident-Based Reporting System (NIBRS). See the Chicago Police Department listing of these classifications at http://gis.chicagopolice.org/clearmap_crime_sums/crime_types.",,,,,"FBI Data",
+"X Coordinate", "The x coordinate of the location where the incident occurred in State Plane Illinois East NAD 1983 projection. 
This location is shifted from the actual location for partial redaction but falls on the same block.",,,,,"Location Data", +"Y Coordinate", "The y coordinate of the location where the incident occurred in State Plane Illinois East NAD 1983 projection. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,"X Coordinate","Location Data", +"Year", "Year the incident occurred.",,,,,"Calendar Data", +"Updated On", "Date and time the record was last updated.",,,,,"Crime Metadata", +"Latitude", "The latitude of the location where the incident occurred. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,,"Location Data", +"Longitude", "The longitude of the location where the incident occurred. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,"Latitude","Location Data", +"Location", "The location where the incident occurred in a format that allows for creation of maps and other geographic operations on this data portal. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,,"Location Data", From 987dca785cf159de403ca24691f9f9e018acc4c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20=C5=81awski?= Date: Wed, 25 Oct 2023 11:50:06 +0000 Subject: [PATCH 3/4] Warn users when legacy terms csv argument is used. Deprecate --strict-parsing. Improve error messages and wording. --- .../bg_import/api_call_utils.py | 98 ++++++++++++++++++- .../bg_import/business_glossary_import.py | 17 ++-- .../bg_import/categories_csv_parser.py | 15 +-- .../bg_import/error.py | 11 ++- .../bg_import/glossary.py | 28 +++--- .../bg_import/terms_csv_parser.py | 15 +-- .../tests/categories_csv_parser_test.py | 2 +- .../bg_import/tests/terms_csv_parser_test.py | 4 +- .../bg_import/user_report.py | 2 +- .../bg_import/utils.py | 24 ++--- 10 files changed, 158 insertions(+), 58 deletions(-) diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/api_call_utils.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/api_call_utils.py index 79c7cb2..7b2c5d1 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/api_call_utils.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/api_call_utils.py @@ -7,7 +7,6 @@ import logging_utils import requests - logging.getLogger('urllib3').setLevel(logging.WARNING) logger = logging_utils.get_logger() @@ -20,6 +19,101 @@ def _get_header(project_id: str) -> dict[str, str]: } +def extract_error_details( + response_err: requests.exceptions.RequestException, +) -> list[Any]: + """Extract error details from response error data. + + Args: + response_err: RequestException containing response error data. + + Returns: + List of error details dictionaries. + """ + if response_err.response is None: + return [] + + try: + data = response_err.response.json() + return data.get('error', dict()).get('details', []) + except requests.exceptions.JSONDecodeError: + return [] + + +def extract_debug_info_detail( + response_err: requests.exceptions.RequestException, +) -> str | None: + """Extract debug info details from response error data. + + Return None if no debug info detail found. + + Args: + response_err: RequestException containing response error data. 
+
+  Returns:
+    String representing debug info detail or None if no debug info detail
+    found.
+  """
+  for detail in extract_error_details(response_err):
+    if (
+        detail.get('@type') == 'type.googleapis.com/google.rpc.DebugInfo'
+        and detail.get('detail') is not None
+        and not str(detail.get('detail')).isspace()
+    ):
+      return str(detail.get('detail'))
+  return None
+
+
+def extract_error_code(
+    response_err: requests.exceptions.RequestException,
+) -> str | None:
+  """Extract error code from response error data.
+
+  Return None if no error code found.
+
+  Args:
+    response_err: RequestException containing response error data.
+
+  Returns:
+    String representing error code or None if no error code found.
+  """
+  for detail in extract_error_details(response_err):
+    if (
+        detail.get('@type') == 'type.googleapis.com/google.rpc.ErrorInfo'
+        and detail.get('metadata') is not None
+        and detail.get('metadata').get('code') is not None
+        and not str(detail.get('metadata').get('code')).isspace()
+    ):
+      return str(detail.get('metadata').get('code'))
+
+  return None
+
+
+def create_error_message(
+    method_name: str,
+    url: str,
+    response_err: requests.exceptions.RequestException,
+) -> str:
+  """Create an error message.
+
+  Args:
+    method_name: String containing an HTTP method name.
+    url: String containing the targeted URL.
+    response_err: RequestException containing response error data.
+
+  Returns:
+    String containing a user-friendly error message.
+  """
+  base_err_description = str(response_err)
+  err_description = (
+      extract_debug_info_detail(response_err)
+      or extract_error_code(response_err)
+      or base_err_description
+  )
+  error_msg = f'{method_name} call to {url} returned: {err_description}'
+  return error_msg
+
+
 def fetch_api_response(
     method: Callable[..., Any],
     url: str,
@@ -43,7 +137,7 @@ def fetch_api_response(
     res = method(url, headers=_get_header(project_id), json=request_body)
     res.raise_for_status()
   except requests.exceptions.RequestException as err:
-    error_msg = f'{method_name} call to {url} returned: {err}'
+    error_msg = create_error_message(method_name, url, err)
     return {
         'json': data,
         'error_msg': error_msg
diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/business_glossary_import.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/business_glossary_import.py
index 4075753..744ed81 100644
--- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/business_glossary_import.py
+++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/business_glossary_import.py
@@ -39,7 +39,7 @@ def main() -> None:
     utils.end_program_execution()
 
   parsers_results = _parse_all_csv_files(args)
-  _print_parsing_errors(args, parsers_results)
+  _print_parsing_errors(parsers_results)
 
   if not glossary.is_glossary_empty():
     _handle_non_empty_glossary(import_mode, glossary)
@@ -110,7 +110,9 @@ def _import_glossary_entries(
     entries_to_import += "categories and "
 
   entries_to_import = entries_to_import.removesuffix(" and ")
-  logger.info("Importing CSV %s into Business Glossary...", entries_to_import)
+  logger.info(
+      "Importing CSV file %s into Business Glossary...", entries_to_import
+  )
   return glossary.import_glossary(
       terms=parsed_terms, categories=parsed_categories
   )
@@ -150,14 +152,13 @@ def _handle_non_empty_glossary(
 
 
 def _print_parsing_errors(
-    args: argparse.Namespace,
     parsers_results: dict[entry_type_lib.EntryType, parser_types._ParserReturnType],
 ) -> 
None: if any_errors(parsers_results): for _, parse_errors, _ in parsers_results.values(): utils.display_parsing_errors(parse_errors) - if args.strict_parsing: - utils.end_program_execution() + + utils.end_program_execution() def _parse_all_csv_files( @@ -171,20 +172,20 @@ def _parse_all_csv_files( Returns: dictionary mapping EntryType to _ParserReturnType (a tuple of list of successfully parsed terms, a list of errors and the number of lines we read - in the CSV). + in the CSV file). """ parsers_results = {} terms_csv = ( args.terms_csv if args.terms_csv is not None else args.terms_csv_legacy ) if terms_csv: - logger.info("Parsing terms input CSV...") + logger.info("Parsing terms input CSV file...") parsers_results[entry_type_lib.EntryType.TERM] = ( terms_csv_parser.parse_glossary_csv(terms_csv) ) if args.categories_csv: - logger.info("Parsing categories input CSV...") + logger.info("Parsing categories input CSV file...") parsers_results[entry_type_lib.EntryType.CATEGORY] = ( categories_csv_parser.parse_glossary_csv(args.categories_csv) ) diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/categories_csv_parser.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/categories_csv_parser.py index 4164732..153a7d8 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/categories_csv_parser.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/categories_csv_parser.py @@ -52,7 +52,7 @@ def parse_glossary_csv( Returns: _ParserReturnType - a tuple of list of successfully parsed categories, - a list of errors and the number of lines we read in the CSV. + a list of errors and the number of lines we read in the CSV file. """ categories = {} @@ -94,13 +94,13 @@ def _validate_category( """Validates a business glossary category. Performs the following tests: - - The category is unique in the CSV + - The category is unique in the CSV file - Display name is not empty - Description is not empty Args: category: Category - tracked_categories: Set of categories seen so far in the CSV + tracked_categories: Set of categories seen so far in the CSV file Returns: ParseErrors @@ -126,11 +126,11 @@ def _validate_category( errors.append(err) if category.display_name: - # If the category has appeared before in the CSV we record an error. + # If the category has appeared before in the CSV file we record an error. if category.display_name.lower() in tracked_categories: err = error.ParseError( entry_type.EntryType.CATEGORY, - message="The category is duplicated in the CSV.", + message="The category is duplicated in the CSV file.", column=1, resources=[category.display_name], ) @@ -164,8 +164,9 @@ def parse_category( """Parses a business glossary category. Args: - line_idx: Index of the line where the category appears in the CSV. - record: A list of category attributes in order conforming to the CSV schema. + line_idx: Index of the line where the category appears in the CSV file. + record: A list of category attributes in order conforming to the CSV file + schema. tracked_categories: Set of previously seen display names. 
Returns: diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/error.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/error.py index 759b61f..7576958 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/error.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/error.py @@ -21,10 +21,10 @@ class Error(abc.ABC): """Base class for Error. Attributes: - entry_type: An enum containing the type of the record in the CSV (e.g. TERM, - CATEGORY). - line: An integer containing the line of the record in the CSV. - column: An integer containing the column of the error in the CSV. + entry_type: An enum containing the type of the record in the CSV file (e.g. + TERM, CATEGORY). + line: An integer containing the line of the record in the CSV file. + column: An integer containing the column of the error in the CSV file. resources: A list of the resources (terms, FQNs, entries, etc) that caused the error. message: An optional string indicating a fix for the error. @@ -131,7 +131,8 @@ def _add_record_information(self) -> str: class ParseError(Error): """Initializes an instance of ParseError. - ParseError objects are populated during the CSV parsing and validation phase. + ParseError objects are populated during the CSV file parsing and validation + phase. """ def __init__( diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/glossary.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/glossary.py index 822ca2e..d27368b 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/glossary.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/glossary.py @@ -295,29 +295,29 @@ def _is_relationship_valid( or dst_type != entry_type_lib.EntryType.TERM ): err = ( - f'Won\'t be able to create a "{relationship_type.value}" relation' - f' between "{src_display_name}" and "{dst_display_name}" because' - f' "{src_type}" is not a term or "{dst_type}" is not a term.' + f'Cannot create "{relationship_type.value}" relation between' + f' "{src_display_name}" and "{dst_display_name}" because "{src_type}"' + f' is not a term or "{dst_type}" is not a term.' ) return False, err if src_display_name == dst_display_name and src_type == dst_type: err = ( - f'Won\'t be able to create a "{relationship_type.value}" relation' - f' between "{src_display_name}" and itself.' + f'Cannot create "{relationship_type.value}" relation between' + f' "{src_display_name}" and itself.' ) return False, err if not src_entry: err = ( - f'Won\'t be able to create a "{relationship_type.value}" relation' - f' between "{src_display_name}" and "{dst_display_name}" because' - f' "{src_display_name}" doesn\'t exist in the CSV.' + f'Cannot create "{relationship_type.value}" relation between' + f' "{src_display_name}" and "{dst_display_name}" because' + f' "{src_display_name}" doesn\'t exist in the CSV file.' ) return False, err elif not dst_entry: err = ( - f'Won\'t be able to create a "{relationship_type.value}" relation' - f' between "{src_display_name}" and "{dst_display_name}" because' - f' "{dst_display_name}" doesn\'t exist in the CSV.' 
+ f'Cannot create "{relationship_type.value}" relation between' + f' "{src_display_name}" and "{dst_display_name}" because' + f' "{dst_display_name}" doesn\'t exist in the CSV file.' ) return False, err elif ( @@ -325,8 +325,8 @@ def _is_relationship_valid( and dst_type != entry_type_lib.EntryType.CATEGORY ): err = ( - f'Won\'t be able to create a "{relationship_type.value}" relation' - f' between "{src_display_name}" and "{dst_display_name}" because' + f'Cannot create "{relationship_type.value}" relation between' + f' "{src_display_name}" and "{dst_display_name}" because' f' "{dst_display_name}" is not a category.' ) return False, err @@ -513,7 +513,7 @@ def _create_relationships( return successful_relations, errors logger.info( - f'Adding {relationship_type.value} relations between' + f'Adding {relationship_type.value} relation between' f' {src_type.value} and {dst_type.value} entries...' ) tasks = [ diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/terms_csv_parser.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/terms_csv_parser.py index 60f4596..2a766a1 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/terms_csv_parser.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/terms_csv_parser.py @@ -55,7 +55,7 @@ def parse_glossary_csv( Returns: _ParserReturnType - a tuple of list of successfully parsed terms, - a list of errors and the number of lines we read in the CSV. + a list of errors and the number of lines we read in the CSV file. """ terms = {} @@ -95,13 +95,13 @@ def _validate_term( """Validates a business glossary term. Performs the following tests: - - The term is unique in the CSV + - The term is unique in the CSV file - Display name is not empty - Description is not empty Args: term: Term - tracked_terms: Set of terms seen so far in the CSV + tracked_terms: Set of terms seen so far in the CSV file Returns: ParseErrors @@ -127,11 +127,11 @@ def _validate_term( errors.append(err) if term.display_name: - # If the term has appeared before in the CSV we record an error. + # If the term has appeared before in the CSV file we record an error. if term.display_name.lower() in tracked_terms: err = error.ParseError( entry_type.EntryType.TERM, - message="The term is duplicated in the CSV.", + message="The term is duplicated in the CSV file.", column=1, resources=[term.display_name], ) @@ -165,8 +165,9 @@ def parse_term( """Parses a business glossary term. Args: - line_idx: Index of the line where the term appears in the CSV. - record: A list of term attributes in order conforming to the CSV schema. + line_idx: Index of the line where the term appears in the CSV file. + record: A list of term attributes in order conforming to the CSV file + schema. tracked_terms: Set of previously seen display names. 
Returns: diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/categories_csv_parser_test.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/categories_csv_parser_test.py index 57030c2..9f870e5 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/categories_csv_parser_test.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/categories_csv_parser_test.py @@ -97,7 +97,7 @@ def test_read_glossary_csv_duplicate_errors(self): self.assertEqual(errors[0].line, 2) self.assertEqual(errors[0].column, 1) self.assertEqual( - errors[0].message, "The category is duplicated in the CSV." + errors[0].message, "The category is duplicated in the CSV file." ) self.assertEqual(lines_read, 2) diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/terms_csv_parser_test.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/terms_csv_parser_test.py index af9b624..c19c287 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/terms_csv_parser_test.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/terms_csv_parser_test.py @@ -93,7 +93,9 @@ def test_read_glossary_csv_duplicate_errors(self): self.assertEqual(errors[0].entry_type, entry_type.EntryType.TERM) self.assertEqual(errors[0].line, 2) self.assertEqual(errors[0].column, 1) - self.assertEqual(errors[0].message, "The term is duplicated in the CSV.") + self.assertEqual( + errors[0].message, "The term is duplicated in the CSV file." + ) self.assertEqual(lines_read, 2) def test_read_glossary_csv_empty_display_name(self): diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/user_report.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/user_report.py index b627df2..91565fb 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/user_report.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/user_report.py @@ -150,7 +150,7 @@ def _print_statistics( if entry_type in lines_read and entry_type in imported_entries: imported_entries_count = len(imported_entries[entry_type]) parsed_entries_count = lines_read[entry_type] - logger.info(f"Statistics of Imported {entries_name}:") + logger.info(f"Statistics of imported {entries_name}:") logger.info( f"{entries_name.capitalize()} successfully created:" f" {imported_entries_count}/{parsed_entries_count}." diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/utils.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/utils.py index bbe2478..8cc98a2 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/utils.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/utils.py @@ -21,7 +21,7 @@ def csv_file_exists(path: str) -> bool: """Verifies if the provided file path exists. Args: - path: Path of the CSV provided by the user. + path: Path of the CSV file provided by the user. Returns: Boolean value indicating whether the file exists in the filesystem. 
@@ -130,14 +130,6 @@ def configure_argument_parser(parser: argparse.ArgumentParser) -> None:
           " before proceeding with validation and import.\n"
       )
   )
-  parser.add_argument(
-      "--strict-parsing",
-      help=(
-          "If set, the program will finish its execution if there are any"
-          " parsing errors without importing any terms."
-      ),
-      action="store_true",
-  )
 
 
 def display_parsing_errors(errors: list[error.ParseError]) -> None:
@@ -169,10 +161,18 @@ def validate_args(args: argparse.Namespace) -> None:
   # Verify only one terms csv is provided:
   if args.terms_csv and args.terms_csv_legacy:
     logger.error(
-        "At most one of --terms-csv and terms-csv_legacy should be provided."
+        "Only one of the following can be provided: --terms-csv or"
+        " terms_csv_legacy."
     )
     sys.exit(1)
 
+  # Warn users when legacy terms csv argument is used.
+  if args.terms_csv_legacy:
+    logger.warning(
+        "The terms CSV file was passed as a legacy positional argument. Pass"
+        " it via the --terms-csv argument instead."
+    )
+
   _verify_csv_file_existence(args, "terms_csv_legacy")
   _verify_csv_file_existence(args, "terms_csv", prefix="--")
   _verify_csv_file_existence(args, "categories_csv", prefix="--")
@@ -185,12 +185,12 @@ def _verify_csv_file_existence(
 
   Args:
     args: script run arguments
-    arg_name: CSV path argument
+    arg_name: CSV file path argument
     prefix: argument prefix e.g. for --terms_csv prefix="--"
   """
   file_path = vars(args).get(arg_name)
   if file_path and not csv_file_exists(file_path):
     logger.error(
-        f"The provided {prefix}{arg_name} CSV file path doesn't exist."
+        f"The CSV file path provided for {prefix}{arg_name} doesn't exist."
     )
     sys.exit(1)

From d14610e08f3e1b01fdfa76b5c37246a51bbcf29d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20=C5=81awski?=
Date: Mon, 30 Oct 2023 11:19:30 +0000
Subject: [PATCH 4/4] Typo fix. Minor review changes.

---
 .../scripts/python/business-glossary-import/README.md      | 4 ++++
 .../business-glossary-import/bg_import/api_call_utils.py   | 3 +--
 .../python/business-glossary-import/bg_import/glossary.py  | 2 +-
 .../bg_import/tests/glossary_test.py                       | 6 +++---
 .../business-glossary-import/bg_import/user_report.py      | 3 +--
 5 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/README.md b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/README.md
index 6b760ca..1fd156d 100644
--- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/README.md
+++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/README.md
@@ -44,6 +44,10 @@ export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token)
 
 The source CSV files shall adhere to RFC4180 format.
 
+However, a header line is currently not allowed and all records must be
+encoded in the UTF-8 charset. Values must be separated by the comma (`,`)
+character.
+ ### Categories CSV schema Each record in the categories CSV file represents a single category with the diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/api_call_utils.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/api_call_utils.py index 7b2c5d1..5076418 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/api_call_utils.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/api_call_utils.py @@ -110,8 +110,7 @@ def create_error_message( or extract_error_code(response_err) or base_err_description ) - error_msg = f'{method_name} call to {url} returned: {err_description}' - return error_msg + return f'{method_name} call to {url} returned: {err_description}' def fetch_api_response( diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/glossary.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/glossary.py index d27368b..cf0c849 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/glossary.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/glossary.py @@ -582,7 +582,7 @@ def import_glossary( logger.error( 'Errors occurred during categories import.%s', error_log_suffix ) - user_report.print_report_for_erronous_categories_import( + user_report.print_report_for_erroneous_categories_import( imported_categories, categories_import_errors ) utils.end_program_execution() diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/glossary_test.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/glossary_test.py index 649ed0a..803e29a 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/glossary_test.py +++ b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/tests/glossary_test.py @@ -90,9 +90,9 @@ def test_glossary_print_report_and_exit_on_categories_import_error(self): return_value=expected_import_glossary_terms_ret, ) ) - mock_print_report_for_erronous_categories_import = self.enterContext( + mock_print_report_for_erroneous_categories_import = self.enterContext( mock.patch.object( - user_report, "print_report_for_erronous_categories_import" + user_report, "print_report_for_erroneous_categories_import" ) ) mock_end_program_execution = self.enterContext( @@ -104,7 +104,7 @@ def test_glossary_print_report_and_exit_on_categories_import_error(self): glossary.import_glossary(terms, categories) mock_import_glossary_categories.assert_called_once_with(categories) - mock_print_report_for_erronous_categories_import.assert_called_once_with( + mock_print_report_for_erroneous_categories_import.assert_called_once_with( expected_imported_categories, expected_categories_import_errors ) mock_end_program_execution.assert_called_once() diff --git a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/user_report.py b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/user_report.py index 91565fb..f83fe4d 100644 --- a/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/user_report.py +++ 
b/dataplex-quickstart-labs/00-resources/scripts/python/business-glossary-import/bg_import/user_report.py @@ -13,8 +13,7 @@ ] -# Typo erronous -def print_report_for_erronous_categories_import( +def print_report_for_erroneous_categories_import( imported_categories: list[bg_category.Category], categories_import_errors: list[error.EntryImportError], ) -> None:
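
Note: the snippet below is a minimal, hypothetical sketch of how the error-message helpers added to `api_call_utils.py` in this series compose; it is not part of the patch itself. The fake `Response` construction and the `google.rpc.DebugInfo` payload are assumptions made for illustration, and the import assumes the snippet runs from the `bg_import` directory.

```
# Sketch: exercise create_error_message() with a hand-built error payload.
import json

import requests

import api_call_utils  # helpers introduced by this patch series


def make_request_exception(payload):
  """Wraps a fake HTTP 400 response in a RequestException."""
  response = requests.models.Response()
  response.status_code = 400
  # Setting the private _content field is a common test trick; it is what
  # response.json() decodes inside extract_error_details().
  response._content = json.dumps(payload).encode('utf-8')
  return requests.exceptions.RequestException(response=response)


err = make_request_exception({
    'error': {
        'details': [{
            '@type': 'type.googleapis.com/google.rpc.DebugInfo',
            'detail': 'Glossary quota exceeded for the project.',
        }]
    }
})

# extract_debug_info_detail() finds the DebugInfo detail, so this prints:
# POST call to https://example.test returned: Glossary quota exceeded for
# the project.
print(api_call_utils.create_error_message('POST', 'https://example.test', err))
```

When the payload carries no `DebugInfo` detail or `ErrorInfo` code, `create_error_message` falls back to `str(response_err)`, matching the message format used before this change.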