Merge branch 'm-kovalsky/vertipaqRC'
m-kovalsky committed Sep 16, 2024
2 parents ca95559 + 5278afa commit c9fc690
Showing 3 changed files with 236 additions and 122 deletions.
50 changes: 42 additions & 8 deletions src/sempy_labs/_helper_functions.py
@@ -6,7 +6,6 @@
from functools import wraps
import datetime
import time
from pyspark.sql import SparkSession
from typing import Optional, Tuple, List
from uuid import UUID
import sempy_labs._icons as icons
@@ -392,6 +391,7 @@ def save_as_delta_table(
delta_table_name: str,
write_mode: str,
merge_schema: Optional[bool] = False,
schema: Optional[dict] = None,
lakehouse: Optional[str] = None,
workspace: Optional[str] = None,
):
@@ -408,20 +408,31 @@
The write mode for the save operation. Options: 'append', 'overwrite'.
merge_schema : bool, default=False
Merges the schemas of the dataframe to the delta table.
schema : dict, default=None
A dictionary showing the schema of the columns and their data types.
lakehouse : str, default=None
The Fabric lakehouse used by the Direct Lake semantic model.
Defaults to None which resolves to the lakehouse attached to the notebook.
workspace : str, default=None
The Fabric workspace name.
Defaults to None which resolves to the workspace of the attached lakehouse
or if no lakehouse attached, resolves to the workspace of the notebook.
Returns
-------
UUID
The ID of the Power BI report.
"""

from pyspark.sql import SparkSession
from pyspark.sql.types import (
StringType,
IntegerType,
FloatType,
DateType,
StructType,
StructField,
BooleanType,
LongType,
DoubleType,
TimestampType,
)

if workspace is None:
workspace_id = fabric.get_workspace_id()
workspace = fabric.resolve_workspace_name(workspace_id)
@@ -450,9 +461,32 @@
)

dataframe.columns = dataframe.columns.str.replace(" ", "_")

spark = SparkSession.builder.getOrCreate()
spark_df = spark.createDataFrame(dataframe)

type_mapping = {
"string": StringType(),
"str": StringType(),
"integer": IntegerType(),
"int": IntegerType(),
"float": FloatType(),
"date": DateType(),
"bool": BooleanType(),
"boolean": BooleanType(),
"long": LongType(),
"double": DoubleType(),
"timestamp": TimestampType(),
}

if schema is None:
spark_df = spark.createDataFrame(dataframe)
else:
schema_map = StructType(
[
StructField(column_name, type_mapping[data_type], True)
for column_name, data_type in schema.items()
]
)
spark_df = spark.createDataFrame(dataframe, schema_map)

filePath = create_abfss_path(
lakehouse_id=lakehouse_id,
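
The new schema parameter above lets callers bypass Spark's type inference by naming each column's type explicitly; the string keys are resolved through type_mapping into Spark types and assembled into a StructType. A minimal usage sketch, assuming a Fabric notebook with an attached lakehouse and that save_as_delta_table is re-exported at the package level; the dataframe, table name, and column types are illustrative:

import pandas as pd
from sempy_labs import save_as_delta_table

df = pd.DataFrame(
    {
        "Country": ["DE", "FR"],
        "Population": [83_000_000, 68_000_000],
        "Updated": [pd.Timestamp("2024-09-01"), pd.Timestamp("2024-09-01")],
    }
)

# The schema keys must match the dataframe's column names, and the values must be
# strings handled by type_mapping ('string', 'long', 'timestamp', ...).
# Omitting schema keeps the original behavior: spark.createDataFrame(dataframe).
save_as_delta_table(
    dataframe=df,
    delta_table_name="country_population",
    write_mode="overwrite",
    schema={"Country": "string", "Population": "long", "Updated": "timestamp"},
)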
49 changes: 34 additions & 15 deletions src/sempy_labs/_list_functions.py
@@ -8,6 +8,7 @@
pagination,
lro,
resolve_item_type,
format_dax_object_name,
)
import pandas as pd
import base64
@@ -169,10 +170,19 @@ def list_tables(
dataset=dataset,
workspace=workspace,
dax_string="""
SELECT [DIMENSION_NAME],[DIMENSION_CARDINALITY] FROM $SYSTEM.MDSCHEMA_DIMENSIONS
SELECT [DIMENSION_NAME],[ROWS_COUNT] FROM $SYSTEM.DISCOVER_STORAGE_TABLES
WHERE RIGHT ( LEFT ( TABLE_ID, 2 ), 1 ) <> '$'
""",
)

model_size = (
dict_sum.sum()
+ data_sum.sum()
+ hier_sum.sum()
+ rel_sum.sum()
+ uh_sum.sum()
)

rows = []
for t in tom.model.Tables:
t_name = t.Name
@@ -209,9 +219,7 @@ def list_tables(
new_data.update(
{
"Row Count": (
rc[rc["DIMENSION_NAME"] == t_name][
"DIMENSION_CARDINALITY"
].iloc[0]
rc[rc["DIMENSION_NAME"] == t_name]["ROWS_COUNT"].iloc[0]
if not rc.empty
else 0
),
@@ -221,24 +229,33 @@
"Hierarchy Size": h_size,
"Relationship Size": r_size,
"User Hierarchy Size": u_size,
"Partitions": int(len(t.Partitions)),
"Columns": sum(
1 for c in t.Columns if str(c.Type) != "RowNumber"
),
"% DB": round((total_size / model_size) * 100, 2),
}
)

rows.append(new_data)

int_cols = [
"Row Count",
"Total Size",
"Dictionary Size",
"Data Size",
"Hierarchy Size",
"Relationship Size",
"User Hierarchy Size",
]
df[int_cols] = df[int_cols].astype(int)

df = pd.DataFrame(rows)

if extended:
int_cols = [
"Row Count",
"Total Size",
"Dictionary Size",
"Data Size",
"Hierarchy Size",
"Relationship Size",
"User Hierarchy Size",
"Partitions",
"Columns",
]
df[int_cols] = df[int_cols].astype(int)
df["% DB"] = df["% DB"].astype(float)

return df
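
With the changes above, the integer casts and the new Partitions, Columns, and % DB columns only apply when extended output is requested, and row counts now come from DISCOVER_STORAGE_TABLES instead of MDSCHEMA_DIMENSIONS. A short usage sketch, assuming list_tables is exported at the package level; the dataset and workspace names are illustrative:

from sempy_labs import list_tables

# extended=True triggers the Vertipaq-style statistics added in this commit,
# including each table's share of total model size in the '% DB' column.
df = list_tables(dataset="AdventureWorks", workspace="Sales", extended=True)
print(df[["Row Count", "Total Size", "Partitions", "Columns", "% DB"]])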


@@ -1274,6 +1291,8 @@ def list_relationships(
workspace = fabric.resolve_workspace_name(workspace)

dfR = fabric.list_relationships(dataset=dataset, workspace=workspace)
dfR["From Object"] = format_dax_object_name(dfR["From Table"], dfR["From Column"])
dfR["To Object"] = format_dax_object_name(dfR["To Table"], dfR["To Column"])

if extended:
# Used to map the Relationship IDs
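
The list_relationships hunk adds fully qualified From Object and To Object columns by combining each endpoint's table and column names. A rough sketch of how format_dax_object_name behaves as used here; the actual helper lives in _helper_functions.py and is not shown in this diff:

def format_dax_object_name(table, column):
    # Produces 'Table'[Column]; plain string concatenation also broadcasts
    # element-wise over pandas Series, which is how the diff applies it to
    # the From/To Table and Column columns of the relationships dataframe.
    return "'" + table + "'[" + column + "]"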
