Merge branch 'datahub-project:master' into master
anshbansal authored Jul 19, 2024
2 parents 89b03c5 + 4fe5f28 commit ac2812d
Showing 3 changed files with 25 additions and 11 deletions.
15 changes: 6 additions & 9 deletions metadata-ingestion/docs/sources/looker/lookml_post.md
@@ -1,14 +1,11 @@
#### Configuration Notes

:::note

The integration can use a SQL parser to try to determine the tables that the views depend on.

:::

This parsing is disabled by default, but can be enabled by setting `parse_table_names_from_sql: True`. The default parser is based on the [`sqllineage`](https://pypi.org/project/sqllineage/) package.
Since this package doesn't officially support all of the SQL dialects that Looker supports, the results might not be correct. You can, however, implement a custom parser and use it by setting the `sql_parser` configuration value. A custom SQL parser must inherit from `datahub.utilities.sql_parser.SQLParser`
and must be made available to DataHub, for example by installing it as a package. The configuration value then needs to be set to the `module_name.ClassName` of the parser.
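For example, a minimal sketch of the relevant recipe fields (the `my_company.parsers.MyCustomSQLParser` module path is hypothetical; omit `sql_parser` to use the default `sqllineage`-based parser):
```yml
source:
  type: lookml
  config:
    parse_table_names_from_sql: True
    # Hypothetical custom parser class; the module must be installed
    # in the same environment so DataHub can import it.
    sql_parser: my_company.parsers.MyCustomSQLParser
```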
1. If a view contains a Liquid template (e.g. `sql_table_name: {{ user_attributes['db'] }}.kafka_streaming.events`, with `db=ANALYTICS_PROD`), then you will need to specify the values of those variables in the `liquid_variable` config as shown below:
```yml
liquid_variable:
  user_attributes:
    db: ANALYTICS_PROD
```
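With this configuration, `{{ user_attributes['db'] }}` resolves to `ANALYTICS_PROD`, so the view's table is resolved as `ANALYTICS_PROD.kafka_streaming.events`.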
### Multi-Project LookML (Advanced)
15 changes: 13 additions & 2 deletions metadata-ingestion/setup.py
@@ -263,6 +263,12 @@
"azure-identity>=1.14.0",
"azure-storage-blob>=12.19.0",
"azure-storage-file-datalake>=12.14.0",
"more-itertools>=8.12.0",
"pyarrow>=6.0.1",
"smart-open[azure]>=5.2.1",
"tableschema>=1.20.2",
"ujson>=5.2.0",
*path_spec_common,
}

data_lake_profiling = {
@@ -352,6 +358,10 @@
"feast>=0.34.0,<1",
"flask-openid>=1.3.0",
"dask[dataframe]<2024.7.0",
# We were seeing an error like this `numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject`
# with numpy 2.0. This likely indicates a mismatch between scikit-learn and numpy versions.
# https://stackoverflow.com/questions/40845304/runtimewarning-numpy-dtype-size-changed-may-indicate-binary-incompatibility
"numpy<2",
},
"grafana": {"requests"},
"glue": aws_common,
@@ -415,7 +425,7 @@
| {"cachetools"},
"s3": {*s3_base, *data_lake_profiling},
"gcs": {*s3_base, *data_lake_profiling},
"abs": {*abs_base},
"abs": {*abs_base, *data_lake_profiling},
"sagemaker": aws_common,
"salesforce": {"simple-salesforce"},
"snowflake": snowflake_common | usage_common | sqlglot_lib,
@@ -539,6 +549,7 @@
*list(
dependency
for plugin in [
"abs",
"athena",
"bigquery",
"clickhouse",
@@ -627,6 +638,7 @@
entry_points = {
"console_scripts": ["datahub = datahub.entrypoints:main"],
"datahub.ingestion.source.plugins": [
"abs = datahub.ingestion.source.abs.source:ABSSource",
"csv-enricher = datahub.ingestion.source.csv_enricher:CSVEnricherSource",
"file = datahub.ingestion.source.file:GenericFileSource",
"datahub = datahub.ingestion.source.datahub.datahub_source:DataHubSource",
@@ -695,7 +707,6 @@
"demo-data = datahub.ingestion.source.demo_data.DemoDataSource",
"unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource",
"gcs = datahub.ingestion.source.gcs.gcs_source:GCSSource",
"abs = datahub.ingestion.source.abs.source:ABSSource",
"sql-queries = datahub.ingestion.source.sql_queries:SqlQueriesSource",
"fivetran = datahub.ingestion.source.fivetran.fivetran:FivetranSource",
"qlik-sense = datahub.ingestion.source.qlik_sense.qlik_sense:QlikSenseSource",
@@ -134,6 +134,12 @@ class BigQuerySchemaGenerator:
"BIGINT": NumberType,
"TINYINT": NumberType,
"BYTEINT": NumberType,
"BIGNUMERIC": NumberType,
"NUMERIC": NumberType,
"DECIMAL": NumberType,
"BIGDECIMAL": NumberType,
"FLOAT64": NumberType,
"RANGE": NullType,
"STRING": StringType,
"TIME": TimeType,
"TIMESTAMP": TimeType,
