Skip to content

Commit

Permalink
Merge pull request #45 from GoogleCloudPlatform/patryklawski_bg_impor…
Browse files Browse the repository at this point in the history
…t_extension_for_categories

Add Categories import to Business Glossary import functionality
  • Loading branch information
anagha-google authored Nov 7, 2023
2 parents bbc642d + a8f92f0 commit 90aaf5d
Show file tree
Hide file tree
Showing 27 changed files with 3,374 additions and 963 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"Crime Metadata","Aggregates categories related to crime such us Location Metadata, Calendar Data, IUCR Data and FBI Data categories, also aggregates some generic terms for crime e.g. case number.","Don Joe<[email protected]>, John Doe<[email protected]>",,
"Location Data","Aggregates terms related to crime location such as coordinates, address, ward, district, beat and location descritption.","Don Joe<[email protected]>, John Doe<[email protected]>","Crime Metadata",
"Calendar Data","Aggregates terms related to crime date such as year or exact date.","Don Joe<[email protected]>, John Doe<[email protected]>","Crime Metadata",
"IUCR Data","Aggregates terms related to Illinois Uniform Crim Reporting such as IUCR, primary type or description.","Don Joe<[email protected]>, John Doe<[email protected]>","Crime Metadata",
"FBI Data","Aggregates terms related to FBI - e.g. FBI Code',"Don Joe<[email protected]>, John Doe<[email protected]>","Crime Metadata",
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
"ID", "Unique identifier for the record.",,,,
"Case Number", "The Chicago Police Department RD Number (Records Division Number), which is unique to the incident.",,,,
"Date", "Date when the incident occurred. this is sometimes a best estimate.",,,,
"Block", "The partially redacted address where the incident occurred, placing it on the same block as the actual address.",,,,
"IUCR", "The Illinois Unifrom Crime Reporting code. This is directly linked to the Primary Type and Description. See the list of IUCR codes at https://data.cityofchicago.org/d/c7ck-438e.",,,,
"Primary Type", "The primary description of the IUCR code.",,,,"IUCR"
"Description", "The secondary description of the IUCR code, a subcategory of the primary description.",,,,"IUCR"
"Location Description", "Description of the location where the incident occurred.",,,,
"Arrest", "Indicates whether an arrest was made.",,,,
"Domestic", "Indicates whether the incident was domestic-related as defined by the Illinois Domestic Violence Act.",,,,
"Beat", "Indicates the beat where the incident occurred. A beat is the smallest police geographic area – each beat has a dedicated police beat car. Three to five beats make up a police sector, and three sectors make up a police district. The Chicago Police Department has 22 police districts. See the beats at https://data.cityofchicago.org/d/aerh-rz74.",,,,
"District", "Indicates the police district where the incident occurred. See the districts at https://data.cityofchicago.org/d/fthy-xz3r.",,,,"Beat"
"Ward", "The ward (City Council district) where the incident occurred. See the wards at https://data.cityofchicago.org/d/sp34-6z76.",,,,"Beat, District"
"Community Area", "Indicates the community area where the incident occurred. Chicago has 77 community areas. See the community areas at https://data.cityofchicago.org/d/cauq-8yn6.",,,,"Beat, District, Ward"
"FBI Code", "Indicates the crime classification as outlined in the FBI's National Incident-Based Reporting System (NIBRS). See the Chicago Police Department listing of these classifications at http://gis.chicagopolice.org/clearmap_crime_sums/crime_types.",,,,
"X Coordinate", "The x coordinate of the location where the incident occurred in State Plane Illinois East NAD 1983 projection. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,
"Y Coordinate", "The y coordinate of the location where the incident occurred in State Plane Illinois East NAD 1983 projection. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,"X Coordinate"
"Year", "Year the incident occurred.",,,,
"Updated On", "Date and time the record was last updated.",,,,
"Latitude", "The latitude of the location where the incident occurred. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,
"Longitude", "The longitude of the location where the incident occurred. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,"Latitude"
"Location", "The location where the incident occurred in a format that allows for creation of maps and other geographic operations on this data portal. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,
"ID", "Unique identifier for the record.",,,,,"Crime Metadata",
"Case Number", "The Chicago Police Department RD Number (Records Division Number), which is unique to the incident.",,,,,"Crime Metadata",
"Date", "Date when the incident occurred. this is sometimes a best estimate.",,,,,"Calendar Data",
"Block", "The partially redacted address where the incident occurred, placing it on the same block as the actual address.",,,,,"Location Data",
"IUCR", "The Illinois Unifrom Crime Reporting code. This is directly linked to the Primary Type and Description. See the list of IUCR codes at https://data.cityofchicago.org/d/c7ck-438e.",,,,,"IUCR Data",
"Primary Type", "The primary description of the IUCR code.",,,,"IUCR","IUCR Data",
"Description", "The secondary description of the IUCR code, a subcategory of the primary description.",,,,"IUCR","IUCR Data",
"Location Description", "Description of the location where the incident occurred.",,,,,"Location Data",
"Arrest", "Indicates whether an arrest was made.",,,,,"Crime Metadata",
"Domestic", "Indicates whether the incident was domestic-related as defined by the Illinois Domestic Violence Act.",,,,,"Crime Metadata",
"Beat", "Indicates the beat where the incident occurred. A beat is the smallest police geographic area – each beat has a dedicated police beat car. Three to five beats make up a police sector, and three sectors make up a police district. The Chicago Police Department has 22 police districts. See the beats at https://data.cityofchicago.org/d/aerh-rz74.",,,,,"Location Data",
"District", "Indicates the police district where the incident occurred. See the districts at https://data.cityofchicago.org/d/fthy-xz3r.",,,,"Beat","Location Data",
"Ward", "The ward (City Council district) where the incident occurred. See the wards at https://data.cityofchicago.org/d/sp34-6z76.",,,,"Beat, District","Location Data",
"Community Area", "Indicates the community area where the incident occurred. Chicago has 77 community areas. See the community areas at https://data.cityofchicago.org/d/cauq-8yn6.",,,,"Beat, District, Ward","Location Data",
"FBI Code", "Indicates the crime classification as outlined in the FBI's National Incident-Based Reporting System (NIBRS). See the Chicago Police Department listing of these classifications at http://gis.chicagopolice.org/clearmap_crime_sums/crime_types.",,,,,"FBI Data",
"X Coordinate", "The x coordinate of the location where the incident occurred in State Plane Illinois East NAD 1983 projection. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,,"Location Data",
"Y Coordinate", "The y coordinate of the location where the incident occurred in State Plane Illinois East NAD 1983 projection. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,"X Coordinate","Location Data",
"Year", "Year the incident occurred.",,,,,"Calendar Data",
"Updated On", "Date and time the record was last updated.",,,,,"Crime Metadata",
"Latitude", "The latitude of the location where the incident occurred. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,,"Location Data",
"Longitude", "The longitude of the location where the incident occurred. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,"Latitude","Location Data",
"Location", "The location where the incident occurred in a format that allows for creation of maps and other geographic operations on this data portal. This location is shifted from the actual location for partial redaction but falls on the same block.",,,,,"Location Data",
Original file line number Diff line number Diff line change
@@ -1,28 +1,35 @@
# Overview

`bg_import` is a utility that performs bulk import of terms into a Data Catalog business
glossary from a CSV file. To achieve that the CSV file is parsed and validated.
The resulting list of terms is then added into the target glossary via Data
Catalog API. If any errors occur at any stage of the process then an error
report is printed and import continues or completely stops depending on input flags.
`bg_import` is a utility that performs bulk import of categories and terms into
a Data Catalog business glossary from CSV files. To achieve that, the CSV files - one for
categories and one for terms - are parsed and validated. The resulting lists of
categories and terms are then added into the target glossary via Data Catalog
API. If any errors occur at any stage of the process then an error report is
printed and import continues or completely stops depending on input flags.

The Business Glossary API is currently in private preview, and it needs to be
enabled on the project for it to be used.

## Usage

```
python3 bg_import/business_glossary_import.py <csv file>
python3 bg_import/business_glossary_import.py <terms csv file legacy>
--project=<project_id>
--group=<entry_group_id>
--glossary=<glossary_id>
--location=<location_code>
[--import-mode={strict,clear}]
[--strict-parsing]
[--categories-csv=<categories csv file>]
[--terms-csv=<terms csv file>]
[-h]
```

Run `python3 bg_import/business_glossary_import.py -h` for description of individual arguments.
Currently `strict` and `clear` import modes are supported. The default
mode is `strict`. \
Provide a terms CSV file by using the `--terms-csv` argument; the positional
`terms csv file legacy` argument is deprecated. \
Run `python3 bg_import/business_glossary_import.py -h` for description of
individual arguments.

### Access token

Expand All @@ -35,10 +42,36 @@ export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token)

## CSV file schema

The source CSV file shall adhere to RFC4180 format. Each record in the file
represents a single term with the following values:
The source CSV files shall adhere to RFC4180 format.

`term_display_name,description,steward,tagged_assets,synonyms,related_terms`
However, currently, we do not allow a header line and all records should be
encoded in UTF-8 charset. We expect values to be separated by comma (`,`)
character.

### Categories CSV schema

Each record in the categories CSV file represents a single category with the
following values:

`category_display_name,description,steward,belongs_to_category`

Where:

* `category_display_name` (required): Unique name for the entry category.
* `description` (required): Plain text or rich text encoded as plain text
description for the category.
* `steward` (optional): List of data stewards for the current category, with
each steward separated by a comma (`,`). E.g.: `Data
Steward1<[email protected]>, Data Steward2<[email protected]>`
* `belongs_to_category` (optional): Display name of a category to which the
category belongs

### Terms CSV schema

Each record in the terms CSV file represents a single term with the
following values:

`term_display_name,description,steward,tagged_assets,synonyms,related_terms,belongs_to_category`

Where:

Expand All @@ -51,12 +84,14 @@ Where:
* `tagged_assets` (optional): List of asset names for assets explained by the
current term, with each asset separated by a comma (`,`). If a specific
field of the asset needs to be explained by the current term, and not the
asset as a whole, the field can be indicated by separating it from the
asset name with a colon (:) eg. `asset_name:field`
asset as a whole, the field can be indicated by separating it from the asset
name with a colon (:) eg. `asset_name:field`
* `synonyms` (optional): List of terms that have a synonym relation with the
current term, with each term separated by a comma (`,`)
* `related_terms` (optional): List of terms that have a related-to relation
with the current term, with each term separated by a comma (`,`)
* `belongs_to_category` (optional): Display name of a category to which the
term belongs

In the case where a list of items inside a field contains the delimiter value
comma (,) the field has to be escaped by using double quotes (" "). e.g. term 1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import logging_utils
import requests


logging.getLogger('urllib3').setLevel(logging.WARNING)
logger = logging_utils.get_logger()

Expand All @@ -20,6 +19,100 @@ def _get_header(project_id: str) -> dict[str, str]:
}


def extract_error_details(
    response_err: requests.exceptions.RequestException,
) -> list[Any]:
    """Extract error details from response error data.

    Args:
        response_err: RequestException containing response error data.

    Returns:
        List of error details dictionaries. An empty list is returned when the
        exception carries no response, when the response body is not valid
        JSON, or when the body does not have the
        ``{"error": {"details": [...]}}`` shape.
    """
    # Compare against None explicitly: the caller only needs to know whether a
    # response object is attached at all.
    if response_err.response is None:
        return []

    try:
        data = response_err.response.json()
    except ValueError:
        # requests' JSONDecodeError, as well as the json/simplejson decode
        # errors raised by older requests releases, all derive from ValueError.
        return []

    # Error bodies are not guaranteed to follow the expected shape (the body
    # may be a list, or "error" may be null or a plain string), so validate
    # each level instead of chaining .get() calls that would raise
    # AttributeError on anything that is not a dict.
    error = data.get('error') if isinstance(data, dict) else None
    details = error.get('details') if isinstance(error, dict) else None
    return details if isinstance(details, list) else []


def extract_debug_info_detail(
    response_err: requests.exceptions.RequestException,
) -> str | None:
    """Extract debug info details from response error data.

    Scans the error details of the response and returns the text of the first
    ``google.rpc.DebugInfo`` entry whose ``detail`` field is not blank.

    Args:
        response_err: RequestException containing response error data.

    Returns:
        String representing debug info detail or None if no debug info detail
        found (including when the detail is empty or whitespace-only).
    """
    for detail in extract_error_details(response_err):
        # Skip malformed entries rather than failing on .get().
        if not isinstance(detail, dict):
            continue
        if detail.get('@type') != 'type.googleapis.com/google.rpc.DebugInfo':
            continue

        debug_detail = detail.get('detail')
        # str.isspace() is False for the empty string, so check the stripped
        # text instead to reject both empty and whitespace-only values.
        if debug_detail is not None and str(debug_detail).strip():
            return str(debug_detail)
    return None


def extract_error_code(
    response_err: requests.exceptions.RequestException,
) -> str | None:
    """Extract error code from response error data.

    Scans the error details of the response and returns the ``metadata.code``
    value of the first ``google.rpc.ErrorInfo`` entry that carries a non-blank
    code.

    Args:
        response_err: RequestException containing response error data.

    Returns:
        String representing error code or None if no error code found
        (including when the code is empty or whitespace-only).
    """
    for detail in extract_error_details(response_err):
        # Skip malformed entries rather than failing on .get().
        if not isinstance(detail, dict):
            continue
        if detail.get('@type') != 'type.googleapis.com/google.rpc.ErrorInfo':
            continue

        metadata = detail.get('metadata')
        if not isinstance(metadata, dict):
            continue

        code = metadata.get('code')
        # str.isspace() is False for the empty string, so check the stripped
        # text instead to reject both empty and whitespace-only values.
        if code is not None and str(code).strip():
            return str(code)

    return None


def create_error_message(
    method_name: str,
    url: str,
    response_err: requests.exceptions.RequestException,
) -> str:
    """Build a user friendly message describing a failed API call.

    The description is taken from the first non-empty source, in order of
    preference: the debug info detail of the response, the error code of the
    response, and finally the string form of the exception itself.

    Args:
        method_name: String containing a http method name.
        url: String containing targeted url.
        response_err: RequestException containing response error data.

    Returns:
        String containing user friendly error message.
    """
    fallback = str(response_err)

    description = extract_debug_info_detail(response_err)
    if not description:
        description = extract_error_code(response_err)
    if not description:
        description = fallback

    return f'{method_name} call to {url} returned: {description}'


def fetch_api_response(
method: Callable[..., Any],
url: str,
Expand All @@ -43,7 +136,7 @@ def fetch_api_response(
res = method(url, headers=_get_header(project_id), json=request_body)
res.raise_for_status()
except requests.exceptions.RequestException as err:
error_msg = f'{method_name} call to {url} returned: {err}'
error_msg = create_error_message(method_name, url, err)
return {
'json': data,
'error_msg': error_msg
Expand Down
Loading

0 comments on commit 90aaf5d

Please sign in to comment.