From ec062b6787dbf3a4285cc571014b939e778c0fa3 Mon Sep 17 00:00:00 2001
From: Gabe Lyons
Date: Mon, 7 Feb 2022 14:29:51 -0800
Subject: [PATCH 1/6] feat(glue): make ownership configurable in glue source (#4078)

---
 metadata-ingestion/source_docs/glue.md        |  1 +
 .../src/datahub/ingestion/source/aws/glue.py  | 20 ++++++++++++-------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/metadata-ingestion/source_docs/glue.md b/metadata-ingestion/source_docs/glue.md
index a4ddedfb50be8..27f5074f9775f 100644
--- a/metadata-ingestion/source_docs/glue.md
+++ b/metadata-ingestion/source_docs/glue.md
@@ -92,6 +92,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
 | `ignore_unsupported_connectors` | | `True` | Whether to ignore unsupported connectors. If disabled, an error will be raised. |
 | `emit_s3_lineage` | | `True` | Whether to emit S3-to-Glue lineage. |
 | `glue_s3_lineage_direction` | | `upstream` | If `upstream`, S3 is upstream to Glue. If `downstream` S3 is downstream to Glue. |
+| `extract_owners` | | `True` | When enabled, extracts ownership from Glue directly and overwrites existing owners. When disabled, ownership is left empty for datasets. |

 ## Compatibility

diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
index d91a5f2893408..1491992469983 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
@@ -48,6 +48,7 @@


 class GlueSourceConfig(AwsSourceConfig):
+    extract_owners: Optional[bool] = True
     extract_transforms: Optional[bool] = True
     underlying_platform: Optional[str] = None
     ignore_unsupported_connectors: Optional[bool] = True
@@ -89,6 +90,7 @@ class GlueSource(Source):

     def __init__(self, config: GlueSourceConfig, ctx: PipelineContext):
         super().__init__(ctx)
+        self.extract_owners = config.extract_owners
         self.source_config = config
         self.report = GlueSourceReport()
         self.glue_client = config.glue_client
@@ -612,7 +614,7 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]:
             yield dataset_wu

     def _extract_record(self, table: Dict, table_name: str) -> MetadataChangeEvent:
-        def get_owner() -> OwnershipClass:
+        def get_owner() -> Optional[OwnershipClass]:
             owner = table.get("Owner")
             if owner:
                 owners = [
@@ -621,11 +623,10 @@ def get_owner() -> OwnershipClass:
                         type=OwnershipTypeClass.DATAOWNER,
                     )
                 ]
-            else:
-                owners = []
-            return OwnershipClass(
-                owners=owners,
-            )
+                return OwnershipClass(
+                    owners=owners,
+                )
+            return None

         def get_dataset_properties() -> DatasetPropertiesClass:
             return DatasetPropertiesClass(
@@ -680,7 +681,12 @@ def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
         )

         dataset_snapshot.aspects.append(Status(removed=False))
-        dataset_snapshot.aspects.append(get_owner())
+
+        if self.extract_owners:
+            optional_owner_aspect = get_owner()
+            if optional_owner_aspect is not None:
+                dataset_snapshot.aspects.append(optional_owner_aspect)
+
         dataset_snapshot.aspects.append(get_dataset_properties())
         dataset_snapshot.aspects.append(get_schema_metadata(self))
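
The new `extract_owners` flag is driven from an ordinary ingestion recipe. A minimal sketch, assuming a locally running DataHub instance and placeholder AWS settings; only `extract_owners` itself comes from this patch, the other fields follow the existing Glue source docs:

```python
# Hypothetical recipe: disable Glue ownership extraction so that owners
# curated in DataHub are not overwritten on every ingestion run.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "glue",
            "config": {
                "aws_region": "us-west-2",  # placeholder region
                "extract_owners": False,    # new flag added by this patch
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```
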
From 60c17a2ba7fad598bca9483a7539623dfda6e490 Mon Sep 17 00:00:00 2001
From: Dexter Lee
Date: Mon, 7 Feb 2022 19:58:16 -0800
Subject: [PATCH 2/6] fix(ingest): datahub-rest - retry on POST for emitter (#4081)

Co-authored-by: Shirshanka Das
---
 metadata-ingestion/src/datahub/emitter/rest_emitter.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py
index ddd9003a610cf..ee966e6bb8954 100644
--- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py
+++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py
@@ -50,7 +50,8 @@ class DatahubRestEmitter:
         503,
         504,
     ]
-    DEFAULT_RETRY_MAX_TIMES = 1
+    DEFAULT_RETRY_METHODS = ["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"]
+    DEFAULT_RETRY_MAX_TIMES = 3

     _gms_server: str
     _token: Optional[str]
@@ -58,6 +59,7 @@ class DatahubRestEmitter:
     _connect_timeout_sec: float = DEFAULT_CONNECT_TIMEOUT_SEC
     _read_timeout_sec: float = DEFAULT_READ_TIMEOUT_SEC
     _retry_status_codes: List[int] = DEFAULT_RETRY_STATUS_CODES
+    _retry_methods: List[str] = DEFAULT_RETRY_METHODS
     _retry_max_times: int = DEFAULT_RETRY_MAX_TIMES

     def __init__(
@@ -67,6 +69,7 @@ def __init__(
         connect_timeout_sec: Optional[float] = None,
         read_timeout_sec: Optional[float] = None,
         retry_status_codes: Optional[List[int]] = None,
+        retry_methods: Optional[List[str]] = None,
        retry_max_times: Optional[int] = None,
         extra_headers: Optional[Dict[str, str]] = None,
         ca_certificate_path: Optional[str] = None,
@@ -105,6 +108,9 @@ def __init__(
         if retry_status_codes is not None:  # Only if missing. Empty list is allowed
             self._retry_status_codes = retry_status_codes

+        if retry_methods is not None:
+            self._retry_methods = retry_methods
+
         if retry_max_times:
             self._retry_max_times = retry_max_times

@@ -112,6 +118,7 @@ def __init__(
             total=self._retry_max_times,
             status_forcelist=self._retry_status_codes,
             backoff_factor=2,
+            allowed_methods=self._retry_methods,
         )

         adapter = HTTPAdapter(
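
With this change the emitter retries POST requests up to three times with exponential backoff instead of failing after a single attempt. A minimal sketch of tuning the new knobs from client code, assuming a GMS reachable at the given URL; the argument names come from the constructor signature shown above:

```python
from datahub.emitter.rest_emitter import DatahubRestEmitter

# Retry only on throttling / gateway errors, and allow POSTs to be retried.
emitter = DatahubRestEmitter(
    gms_server="http://localhost:8080",
    retry_status_codes=[429, 502, 503, 504],
    retry_methods=["GET", "POST"],
    retry_max_times=3,
)
emitter.test_connection()  # raises if the server cannot be reached after retries
```

Retrying POST makes sense here because metadata emission is idempotent on the server side; callers that do not want POST retries can pass a narrower `retry_methods` list.
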
From 2de29dc623854a35cd808d789ad9ccec22dbfa54 Mon Sep 17 00:00:00 2001
From: Tamas Nemeth
Date: Tue, 8 Feb 2022 19:09:30 +0100
Subject: [PATCH 3/6] feat(ingest) - bigquery: More verbose and faster lineage generation and option to set partition datetime for profiling (#4079)

---
 .../source_docs/sql_profiles.md               | 48 ++++++++++---------
 .../ingestion/source/ge_profiling_config.py   |  2 +
 .../datahub/ingestion/source/sql/bigquery.py  | 33 +++++++++----
 .../ingestion/source/sql/sql_common.py        |  5 +-
 4 files changed, 53 insertions(+), 35 deletions(-)

diff --git a/metadata-ingestion/source_docs/sql_profiles.md b/metadata-ingestion/source_docs/sql_profiles.md
index e59a0d45a3be5..c380e96b6ade1 100644
--- a/metadata-ingestion/source_docs/sql_profiles.md
+++ b/metadata-ingestion/source_docs/sql_profiles.md
@@ -69,30 +69,32 @@ sink:

 Note that a `.` is used to denote nested fields in the YAML recipe.

-| Field | Required | Default | Description |
-| -------------------------------------------------- | -------- |----------------------|----------------------------------------|
-| `profiling.enabled` | | `False` | Whether profiling should be done. |
-| `profiling.bigquery_temp_table_schema` | | | On bigquery for profiling partitioned tables needs to create temporary views. You have to define a schema where these will be created. Views will be cleaned up after profiler runs. (Great expectation tech details about this [here](https://legacy.docs.greatexpectations.io/en/0.9.0/reference/integrations/bigquery.html#custom-queries-with-sql-datasource). |
-| `profiling.limit` | | | Max number of documents to profile. By default, profiles all documents. |
-| `profiling.offset` | | | Offset in documents to profile. By default, uses no offset. |
-| `profiling.max_workers` | | `5 * os.cpu_count()` | Number of worker threads to use for profiling. Set to 1 to disable. |
-| `profiling.query_combiner_enabled` | | `True` | *This feature is still experimental and can be disabled if it causes issues.* Reduces the total number of queries issued and speeds up profiling by dynamically combining SQL queries where possible. |
-| `profile_pattern.allow` | | `*` | List of regex patterns for tables or table columns to profile. Defaults to all. |
-| `profile_pattern.deny` | | | List of regex patterns for tables or table columns to not profile. Defaults to none. |
-| `profile_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. |
-| `profiling.turn_off_expensive_profiling_metrics` | | False | Whether to turn off expensive profiling or not. This turns off profiling for quantiles, distinct_value_frequencies, histogram & sample_values. This also limits maximum number of fields being profiled to 10. |
-| `profiling.max_number_of_fields_to_profile` | | `None` | A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up. |
-| `profiling.profile_table_level_only` | | False | Whether to perform profiling at table-level only, or include column-level profiling as well. |
-| `profiling.include_field_null_count` | | `True` | Whether to profile for the number of nulls for each column. |
-| `profiling.include_field_min_value` | | `True` | Whether to profile for the min value of numeric columns. |
-| `profiling.include_field_max_value` | | `True` | Whether to profile for the max value of numeric columns. |
-| `profiling.include_field_mean_value` | | `True` | Whether to profile for the mean value of numeric columns. |
-| `profiling.include_field_median_value` | | `True` | Whether to profile for the median value of numeric columns. |
-| `profiling.include_field_stddev_value` | | `True` | Whether to profile for the standard deviation of numeric columns. |
-| `profiling.include_field_quantiles` | | `False` | Whether to profile for the quantiles of numeric columns. |
-| `profiling.include_field_distinct_value_frequencies` | | `False` | Whether to profile for distinct value frequencies. |
-| `profiling.include_field_histogram` | | `False` | Whether to profile for the histogram for numeric fields. |
-| `profiling.include_field_sample_values` | | `True` | Whether to profile for the sample values for all columns. |
+| Field | Required | Default | Description |
+|------------------------------------------------------|----------|----------------------|----------------------------------------|
+| `profiling.enabled` | | `False` | Whether profiling should be done. |
+| `profiling.bigquery_temp_table_schema` | | | On BigQuery, profiling partitioned tables requires creating temporary views, so you must define a schema where these views will be created. Views are cleaned up after the profiler runs. (Great Expectations technical details are [here](https://legacy.docs.greatexpectations.io/en/0.9.0/reference/integrations/bigquery.html#custom-queries-with-sql-datasource).) |
+| `profiling.limit` | | | Max number of documents to profile. By default, profiles all documents. |
+| `profiling.offset` | | | Offset in documents to profile. By default, uses no offset. |
+| `profiling.max_workers` | | `5 * os.cpu_count()` | Number of worker threads to use for profiling. Set to 1 to disable. |
+| `profiling.query_combiner_enabled` | | `True` | *This feature is still experimental and can be disabled if it causes issues.* Reduces the total number of queries issued and speeds up profiling by dynamically combining SQL queries where possible. |
+| `profile_pattern.allow` | | `*` | List of regex patterns for tables or table columns to profile. Defaults to all. |
+| `profile_pattern.deny` | | | List of regex patterns for tables or table columns to not profile. Defaults to none. |
+| `profile_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. |
+| `profiling.turn_off_expensive_profiling_metrics` | | False | Whether to turn off expensive profiling or not. This turns off profiling for quantiles, distinct_value_frequencies, histogram & sample_values. This also limits maximum number of fields being profiled to 10. |
+| `profiling.max_number_of_fields_to_profile` | | `None` | A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up. |
+| `profiling.profile_table_level_only` | | False | Whether to perform profiling at table-level only, or include column-level profiling as well. |
+| `profiling.include_field_null_count` | | `True` | Whether to profile for the number of nulls for each column. |
+| `profiling.include_field_min_value` | | `True` | Whether to profile for the min value of numeric columns. |
+| `profiling.include_field_max_value` | | `True` | Whether to profile for the max value of numeric columns. |
+| `profiling.include_field_mean_value` | | `True` | Whether to profile for the mean value of numeric columns. |
+| `profiling.include_field_median_value` | | `True` | Whether to profile for the median value of numeric columns. |
+| `profiling.include_field_stddev_value` | | `True` | Whether to profile for the standard deviation of numeric columns. |
+| `profiling.include_field_quantiles` | | `False` | Whether to profile for the quantiles of numeric columns. |
+| `profiling.include_field_distinct_value_frequencies` | | `False` | Whether to profile for distinct value frequencies. |
+| `profiling.include_field_histogram` | | `False` | Whether to profile for the histogram for numeric fields. |
+| `profiling.include_field_sample_values` | | `True` | Whether to profile for the sample values for all columns. |
+| `profiling.partition_datetime` | | | For partitioned datasets, profile only the partition that matches this datetime; if not set, the latest partition is profiled. Only BigQuery supports this. |
+
 ## Compatibility

 Coming soon!
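
The new `profiling.partition_datetime` option can be pinned in a recipe so that repeated profiling runs target the same BigQuery partition. A minimal sketch, assuming a placeholder GCP project and a console sink; every field other than the two profiling options documented above is illustrative:

```python
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "bigquery",
            "config": {
                "project_id": "my-gcp-project",  # placeholder project
                "profiling": {
                    "enabled": True,
                    # Profile exactly this partition instead of deriving the
                    # timestamp from the latest partition id.
                    "partition_datetime": "2022-02-01 00:00:00",
                    "bigquery_temp_table_schema": "my-gcp-project.profiling_tmp",
                },
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
```
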
diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
index 657195916e114..974d622090da3 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
@@ -1,3 +1,4 @@
+import datetime
 import os
 from typing import Any, Dict, List, Optional

@@ -43,6 +44,7 @@ class GEProfilingConfig(ConfigModel):
     catch_exceptions: bool = True

     bigquery_temp_table_schema: Optional[str] = None
+    partition_datetime: Optional[datetime.datetime]

     @pydantic.root_validator()
     def ensure_field_level_settings_are_normalized(

diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py
index 7e2afe1306f9c..1cae97f2ad2a9 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py
@@ -69,6 +69,10 @@
             protoPayload.serviceData.jobCompletedEvent.job.jobStatus.state="DONE"
             AND NOT
             protoPayload.serviceData.jobCompletedEvent.job.jobStatus.error.code:*
+            AND
+            protoPayload.serviceData.jobCompletedEvent.job.jobConfiguration.query.destinationTable.datasetId !~ "^_.*"
+            AND
+            protoPayload.serviceData.jobCompletedEvent.job.jobStatistics.referencedTables:*
         )
     )
     AND
@@ -95,7 +99,7 @@
 where
     is_partitioning_column = 'YES'
     -- Filter out special partitions (https://cloud.google.com/bigquery/docs/partitioned-tables#date_timestamp_partitioned_tables)
-    and p.partition_id not in ('__NULL__', '__UNPARTITIONED__')
+    and p.partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__')
     and STORAGE_TIER='ACTIVE'
     and p.table_name= '{table}'
 group by
@@ -370,12 +374,20 @@ def _get_bigquery_log_entries(
             ),
         )

-        logger.debug("Start loading log entries from BigQuery")
+        assert self.config.log_page_size is not None
+
+        logger.info("Start loading log entries from BigQuery")
         for client in clients:
-            yield from client.list_entries(
+            entries = client.list_entries(
                 filter_=filter, page_size=self.config.log_page_size
             )
-        logger.debug("finished loading log entries from BigQuery")
+            item = 0
+            for entry in entries:
+                item = item + 1
+                if item % self.config.log_page_size == 0:
+                    logger.info(f"Read {item} entry from log entries")
+                yield entry
+        logger.info(f"Finished loading {item} log entries from BigQuery")

     def _get_exported_bigquery_audit_metadata(
         self, bigquery_client: BigQueryClient
@@ -391,7 +403,7 @@ def _get_exported_bigquery_audit_metadata(
         ).strftime(BQ_DATETIME_FORMAT)

         for dataset in self.config.bigquery_audit_metadata_datasets:
-            logger.debug(
+            logger.info(
                 f"Start loading log entries from BigQueryAuditMetadata in {dataset}"
             )

@@ -418,7 +430,7 @@ def _get_exported_bigquery_audit_metadata(
             ).format(start_time=start_time, end_time=end_time)

             query_job = bigquery_client.query(query)
-            logger.debug(
+            logger.info(
                 f"Finished loading log entries from BigQueryAuditMetadata in {dataset}"
             )
@@ -542,7 +554,7 @@ def is_latest_shard(self, project_id: str, schema: str, table: str) -> bool:
         return True

     def generate_partition_profiler_query(
-        self, schema: str, table: str
+        self, schema: str, table: str, partition_datetime: Optional[datetime.datetime]
     ) -> Tuple[Optional[str], Optional[str]]:
         """
         Method returns partition id if table is partitioned or sharded and generate custom partition query for
@@ -553,12 +565,13 @@ def generate_partition_profiler_query(
         partition = self.get_latest_partition(schema, table)
         if partition:
             partition_ts: Union[datetime.datetime, datetime.date]
-
+            if not partition_datetime:
+                partition_datetime = parser.parse(partition.partition_id)
             logger.debug(f"{table} is partitioned and partition column is {partition}")
             if partition.data_type in ("TIMESTAMP", "DATETIME"):
-                partition_ts = parser.parse(partition.partition_id)
+                partition_ts = partition_datetime
             elif partition.data_type == "DATE":
-                partition_ts = parser.parse(partition.partition_id).date()
+                partition_ts = partition_datetime.date()
             else:
                 logger.warning(f"Not supported partition type {partition.data_type}")
                 return None, None

diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
index efbf6800009f1..ab07b77a037f6 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
@@ -1,3 +1,4 @@
+import datetime
 import logging
 from abc import abstractmethod
 from dataclasses import dataclass, field
@@ -1071,7 +1072,7 @@ def _get_profiler_instance(self, inspector: Inspector) -> "DatahubGEProfiler":

     # Override if needed
     def generate_partition_profiler_query(
-        self, schema: str, table: str
+        self, schema: str, table: str, partition_datetime: Optional[datetime.datetime]
     ) -> Tuple[Optional[str], Optional[str]]:
         return None, None

@@ -1111,7 +1112,7 @@ def loop_profiler_requests(
                 continue

             (partition, custom_sql) = self.generate_partition_profiler_query(
-                schema, table
+                schema, table, self.config.profiling.partition_datetime
             )

             self.report.report_entity_profiled(dataset_name)
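
The behavioural change in `generate_partition_profiler_query` is easiest to see in isolation: an explicit `partition_datetime` now takes precedence, otherwise the timestamp is still derived from the partition id. An illustrative sketch, not the DataHub source, requiring only `python-dateutil`:

```python
import datetime
from typing import Optional

from dateutil import parser


def resolve_partition_ts(
    partition_id: str, partition_datetime: Optional[datetime.datetime]
) -> datetime.datetime:
    # Mirrors the patched logic: an explicit override wins; otherwise the
    # timestamp is parsed from the partition id (e.g. "20220201" -> 2022-02-01).
    return partition_datetime or parser.parse(partition_id)


print(resolve_partition_ts("20220201", None))
# 2022-02-01 00:00:00
print(resolve_partition_ts("20220201", datetime.datetime(2022, 1, 15)))
# 2022-01-15 00:00:00
```
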
From 9d3bc828826e5192ebd176496c07af62334ab2be Mon Sep 17 00:00:00 2001
From: Gabe Lyons
Date: Tue, 8 Feb 2022 11:32:54 -0800
Subject: [PATCH 4/6] make schema tab no longer default for glossary term (#4080)

---
 .../src/app/entity/glossaryTerm/GlossaryTermEntity.tsx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/datahub-web-react/src/app/entity/glossaryTerm/GlossaryTermEntity.tsx b/datahub-web-react/src/app/entity/glossaryTerm/GlossaryTermEntity.tsx
index 8bf6e580ca1cd..fd3e0a7aaf3f8 100644
--- a/datahub-web-react/src/app/entity/glossaryTerm/GlossaryTermEntity.tsx
+++ b/datahub-web-react/src/app/entity/glossaryTerm/GlossaryTermEntity.tsx
@@ -60,6 +60,10 @@ export class GlossaryTermEntity implements Entity {
                 entityType={EntityType.GlossaryTerm}
                 useEntityQuery={useGetGlossaryTermQuery as any}
                 tabs={[
+                    {
+                        name: 'Related Entities',
+                        component: GlossaryRelatedEntity,
+                    },
                     {
                         name: 'Schema',
                         component: SchemaTab,
@@ -73,10 +77,6 @@ export class GlossaryTermEntity implements Entity {
                                 glossaryTerm?.glossaryTerm?.schemaMetadata !== null,
                         },
                     },
-                    {
-                        name: 'Related Entities',
-                        component: GlossaryRelatedEntity,
-                    },
                     {
                         name: 'Related Terms',
                         component: GlossayRelatedTerms,
From f5a51f0a74492ab672c1f623f4761af30fe3c1c3 Mon Sep 17 00:00:00 2001
From: John Joyce
Date: Tue, 8 Feb 2022 12:17:38 -0800
Subject: [PATCH 5/6] fix(ingest): rest-emitter - fix serialization helper conditional (#4090)

hot-fix for rest emission issues in 0.8.25.0 and 0.8.25.1
---
 metadata-ingestion/src/datahub/emitter/serialization_helper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metadata-ingestion/src/datahub/emitter/serialization_helper.py b/metadata-ingestion/src/datahub/emitter/serialization_helper.py
index 54a2e6cd63af6..5a348ce267b10 100644
--- a/metadata-ingestion/src/datahub/emitter/serialization_helper.py
+++ b/metadata-ingestion/src/datahub/emitter/serialization_helper.py
@@ -7,7 +7,7 @@ def _json_transform(obj: Any, from_pattern: str, to_pattern: str) -> Any:
         if len(obj.keys()) == 1:
             key: str = list(obj.keys())[0]
             value = obj[key]
-            if key.startswith(from_pattern) >= 0:
+            if key.startswith(from_pattern):
                 new_key = key.replace(from_pattern, to_pattern, 1)
                 return {new_key: _json_transform(value, from_pattern, to_pattern)}
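
The one-character fix hides a real logic bug: `str.startswith` returns a bool, and both `True >= 0` and `False >= 0` evaluate to true, so the old condition matched every single-key dict rather than only keys with the expected prefix. A quick illustration; the pattern string below is just an example of the kind of prefix this helper rewrites:

```python
# Buggy check: a ">= 0" comparison meant for find() applied to startswith().
print("something.else".startswith("com.linkedin.pegasus2avro.") >= 0)  # True (bug)

# Fixed check: only genuinely matching keys pass.
print("something.else".startswith("com.linkedin.pegasus2avro."))       # False
print("com.linkedin.pegasus2avro.common.Status".startswith("com.linkedin.pegasus2avro."))  # True
```
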
From 306fe0b5ffe3e59857ca5643136c8b29d80d4d60 Mon Sep 17 00:00:00 2001
From: Gabe Lyons
Date: Tue, 8 Feb 2022 12:27:09 -0800
Subject: [PATCH 6/6] fix(terms): fix removing terms from schema field & add cypress tests to cover these flows (#4091)

---
 .../resolvers/mutate/util/LabelUtils.java     |  2 +-
 .../Schema/utils/useTagsAndTermsRenderer.tsx  | 32 ++++++------
 .../src/app/shared/tags/AddTagTermModal.tsx   |  6 ++-
 .../src/app/shared/tags/TagTermGroup.tsx      |  4 +-
 .../integration/mutations/mutations.js        | 45 +++++++++++++++++++
 5 files changed, 70 insertions(+), 19 deletions(-)

diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/LabelUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/LabelUtils.java
index 69208f0102c72..36776d9eaca0b 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/LabelUtils.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/LabelUtils.java
@@ -66,7 +66,7 @@ public static void removeTermFromTarget(
       }

       removeTermIfExists(editableFieldInfo.getGlossaryTerms(), labelUrn);
-      persistAspect(targetUrn, GLOSSARY_TERM_ASPECT_NAME, editableSchemaMetadata, actor, entityService);
+      persistAspect(targetUrn, EDITABLE_SCHEMA_METADATA, editableSchemaMetadata, actor, entityService);
     }
   }

diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/useTagsAndTermsRenderer.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/useTagsAndTermsRenderer.tsx
index f40a3c9ecde0f..c2c46be8a6b98 100644
--- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/useTagsAndTermsRenderer.tsx
+++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/useTagsAndTermsRenderer.tsx
@@ -19,21 +19,23 @@
         );
         return (
-            setTagHoveredIndex(undefined)} entityUrn={urn} entityType={EntityType.Dataset} entitySubresource={record.fieldPath} refetch={refetch} />
+
+            setTagHoveredIndex(undefined)} entityUrn={urn} entityType={EntityType.Dataset} entitySubresource={record.fieldPath} refetch={refetch} />
+
         );
     };
     return tagAndTermRender;

diff --git a/datahub-web-react/src/app/shared/tags/AddTagTermModal.tsx b/datahub-web-react/src/app/shared/tags/AddTagTermModal.tsx
index f3a80c7d20ade..472d57d9bd5d4 100644
--- a/datahub-web-react/src/app/shared/tags/AddTagTermModal.tsx
+++ b/datahub-web-react/src/app/shared/tags/AddTagTermModal.tsx
@@ -236,7 +236,11 @@ export default function AddTagTermModal({
-

diff --git a/datahub-web-react/src/app/shared/tags/TagTermGroup.tsx b/datahub-web-react/src/app/shared/tags/TagTermGroup.tsx
index 2a22e9681948d..10a083b47570e 100644
--- a/datahub-web-react/src/app/shared/tags/TagTermGroup.tsx
+++ b/datahub-web-react/src/app/shared/tags/TagTermGroup.tsx
@@ -229,7 +229,7 @@ export default function TagTermGroup({
                     {...buttonProps}
                 >
-                    Add Tag
+                    Add Tag
                 )}
             {canAddTerm &&
@@ -243,7 +243,7 @@ export default function TagTermGroup({
                     {...buttonProps}
                 >
-                    Add Term
+                    Add Term
                 )}
             {showAddModal && !!entityUrn && !!entityType && (

diff --git a/smoke-test/tests/cypress/cypress/integration/mutations/mutations.js b/smoke-test/tests/cypress/cypress/integration/mutations/mutations.js
index 4a5b4eb8924c4..a8ba4afc192d4 100644
--- a/smoke-test/tests/cypress/cypress/integration/mutations/mutations.js
+++ b/smoke-test/tests/cypress/cypress/integration/mutations/mutations.js
@@ -37,4 +37,49 @@ describe('mutations', () => {

         cy.deleteUrn('urn:li:tag:CypressTestAddTag')
     });
+
+    it('can add and remove terms from a dataset', () => {
+        cy.login();
+        cy.visit('/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,cypress_logging_events,PROD)');
+        cy.contains('cypress_logging_events');
+
+        cy.contains('Add Term').click();
+
+        cy.focused().type('CypressTerm');
+
+        cy.get('.ant-select-item-option-content').within(() => cy.contains('CypressNode.CypressTerm').click({force: true}));
+
+        cy.get('[data-testid="add-tag-term-from-modal-btn"]').click({force: true});
+        cy.get('[data-testid="add-tag-term-from-modal-btn"]').should('not.exist');
+
+        cy.contains('CypressTerm');
+
+        cy.get('a[href="/glossary/urn:li:glossaryTerm:CypressNode.CypressTerm"]').within(() => cy.get('span[aria-label=close]').click());
+        cy.contains('Yes').click();
+
+        cy.contains('CypressTerm').should('not.exist');
+    });
+
+    it('can add and remove terms from a dataset field', () => {
+        cy.login();
+        // make space for the glossary term column
+        cy.viewport(1300, 800)
+        cy.visit('/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,cypress_logging_events,PROD)');
+        cy.get('[data-testid="schema-field-event_name-terms"]').trigger('mouseover', {force: true});
+        cy.get('[data-testid="schema-field-event_name-terms"]').within(() => cy.contains('Add Term').click())
+
+        cy.focused().type('CypressTerm');
+
+        cy.get('.ant-select-item-option-content').within(() => cy.contains('CypressNode.CypressTerm').click({force: true}));
+
+        cy.get('[data-testid="add-tag-term-from-modal-btn"]').click({force: true});
+        cy.get('[data-testid="add-tag-term-from-modal-btn"]').should('not.exist');
+
+        cy.contains('CypressTerm');
+
+        cy.get('a[href="/glossary/urn:li:glossaryTerm:CypressNode.CypressTerm"]').within(() => cy.get('span[aria-label=close]').click());
+        cy.contains('Yes').click();
+
+        cy.contains('CypressTerm').should('not.exist');
+    });
 })