Merge branch 'm-kovalsky/vertipaqRC'
m-kovalsky committed Sep 16, 2024
2 parents ca95559 + 5278afa commit c9fc690
Showing 3 changed files with 236 additions and 122 deletions.
50 changes: 42 additions & 8 deletions src/sempy_labs/_helper_functions.py
@@ -6,7 +6,6 @@
from functools import wraps
import datetime
import time
from pyspark.sql import SparkSession
from typing import Optional, Tuple, List
from uuid import UUID
import sempy_labs._icons as icons
@@ -392,6 +391,7 @@ def save_as_delta_table(
delta_table_name: str,
write_mode: str,
merge_schema: Optional[bool] = False,
schema: Optional[dict] = None,
lakehouse: Optional[str] = None,
workspace: Optional[str] = None,
):
@@ -408,20 +408,31 @@
The write mode for the save operation. Options: 'append', 'overwrite'.
merge_schema : bool, default=False
Merges the schemas of the dataframe to the delta table.
schema : dict, default=None
A dictionary showing the schema of the columns and their data types.
lakehouse : str, default=None
The Fabric lakehouse used by the Direct Lake semantic model.
Defaults to None which resolves to the lakehouse attached to the notebook.
workspace : str, default=None
The Fabric workspace name.
Defaults to None which resolves to the workspace of the attached lakehouse
or if no lakehouse attached, resolves to the workspace of the notebook.
Returns
-------
UUID
The ID of the Power BI report.
"""

from pyspark.sql import SparkSession
from pyspark.sql.types import (
StringType,
IntegerType,
FloatType,
DateType,
StructType,
StructField,
BooleanType,
LongType,
DoubleType,
TimestampType,
)

if workspace is None:
workspace_id = fabric.get_workspace_id()
workspace = fabric.resolve_workspace_name(workspace_id)
@@ -450,9 +461,32 @@
)

dataframe.columns = dataframe.columns.str.replace(" ", "_")

spark = SparkSession.builder.getOrCreate()
spark_df = spark.createDataFrame(dataframe)

type_mapping = {
"string": StringType(),
"str": StringType(),
"integer": IntegerType(),
"int": IntegerType(),
"float": FloatType(),
"date": DateType(),
"bool": BooleanType(),
"boolean": BooleanType(),
"long": LongType(),
"double": DoubleType(),
"timestamp": TimestampType(),
}

if schema is None:
spark_df = spark.createDataFrame(dataframe)
else:
schema_map = StructType(
[
StructField(column_name, type_mapping[data_type], True)
for column_name, data_type in schema.items()
]
)
spark_df = spark.createDataFrame(dataframe, schema_map)

filePath = create_abfss_path(
lakehouse_id=lakehouse_id,
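
The new schema parameter above lets callers bypass Spark's type inference by naming each column's type explicitly; the string keys are resolved through type_mapping into Spark types and assembled into a StructType. A minimal usage sketch, assuming a Fabric notebook with an attached lakehouse and that save_as_delta_table is re-exported at the package level; the dataframe, table name, and column types are illustrative:

import pandas as pd
from sempy_labs import save_as_delta_table

df = pd.DataFrame(
    {
        "Country": ["DE", "FR"],
        "Population": [83_000_000, 68_000_000],
        "Updated": [pd.Timestamp("2024-09-01"), pd.Timestamp("2024-09-01")],
    }
)

# The schema keys must match the dataframe's column names, and the values must be
# strings handled by type_mapping ('string', 'long', 'timestamp', ...).
# Omitting schema keeps the original behavior: spark.createDataFrame(dataframe).
save_as_delta_table(
    dataframe=df,
    delta_table_name="country_population",
    write_mode="overwrite",
    schema={"Country": "string", "Population": "long", "Updated": "timestamp"},
)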
49 changes: 34 additions & 15 deletions src/sempy_labs/_list_functions.py
@@ -8,6 +8,7 @@
pagination,
lro,
resolve_item_type,
format_dax_object_name,
)
import pandas as pd
import base64
@@ -169,10 +170,19 @@ def list_tables(
dataset=dataset,
workspace=workspace,
dax_string="""
SELECT [DIMENSION_NAME],[DIMENSION_CARDINALITY] FROM $SYSTEM.MDSCHEMA_DIMENSIONS
SELECT [DIMENSION_NAME],[ROWS_COUNT] FROM $SYSTEM.DISCOVER_STORAGE_TABLES
WHERE RIGHT ( LEFT ( TABLE_ID, 2 ), 1 ) <> '$'
""",
)

model_size = (
dict_sum.sum()
+ data_sum.sum()
+ hier_sum.sum()
+ rel_sum.sum()
+ uh_sum.sum()
)

rows = []
for t in tom.model.Tables:
t_name = t.Name
@@ -209,9 +219,7 @@ def list_tables(
new_data.update(
{
"Row Count": (
rc[rc["DIMENSION_NAME"] == t_name][
"DIMENSION_CARDINALITY"
].iloc[0]
rc[rc["DIMENSION_NAME"] == t_name]["ROWS_COUNT"].iloc[0]
if not rc.empty
else 0
),
@@ -221,24 +229,33 @@
"Hierarchy Size": h_size,
"Relationship Size": r_size,
"User Hierarchy Size": u_size,
"Partitions": int(len(t.Partitions)),
"Columns": sum(
1 for c in t.Columns if str(c.Type) != "RowNumber"
),
"% DB": round((total_size / model_size) * 100, 2),
}
)

rows.append(new_data)

int_cols = [
"Row Count",
"Total Size",
"Dictionary Size",
"Data Size",
"Hierarchy Size",
"Relationship Size",
"User Hierarchy Size",
]
df[int_cols] = df[int_cols].astype(int)

df = pd.DataFrame(rows)

if extended:
int_cols = [
"Row Count",
"Total Size",
"Dictionary Size",
"Data Size",
"Hierarchy Size",
"Relationship Size",
"User Hierarchy Size",
"Partitions",
"Columns",
]
df[int_cols] = df[int_cols].astype(int)
df["% DB"] = df["% DB"].astype(float)

return df
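
With the changes above, the integer casts and the new Partitions, Columns, and % DB columns only apply when extended output is requested, and row counts now come from DISCOVER_STORAGE_TABLES instead of MDSCHEMA_DIMENSIONS. A short usage sketch, assuming list_tables is exported at the package level; the dataset and workspace names are illustrative:

from sempy_labs import list_tables

# extended=True triggers the Vertipaq-style statistics added in this commit,
# including each table's share of total model size in the '% DB' column.
df = list_tables(dataset="AdventureWorks", workspace="Sales", extended=True)
print(df[["Row Count", "Total Size", "Partitions", "Columns", "% DB"]])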


@@ -1274,6 +1291,8 @@ def list_relationships(
workspace = fabric.resolve_workspace_name(workspace)

dfR = fabric.list_relationships(dataset=dataset, workspace=workspace)
dfR["From Object"] = format_dax_object_name(dfR["From Table"], dfR["From Column"])
dfR["To Object"] = format_dax_object_name(dfR["To Table"], dfR["To Column"])

if extended:
# Used to map the Relationship IDs
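
The list_relationships hunk adds fully qualified From Object and To Object columns by combining each endpoint's table and column names. A rough sketch of how format_dax_object_name behaves as used here; the actual helper lives in _helper_functions.py and is not shown in this diff:

def format_dax_object_name(table, column):
    # Produces 'Table'[Column]; plain string concatenation also broadcasts
    # element-wise over pandas Series, which is how the diff applies it to
    # the From/To Table and Column columns of the relationships dataframe.
    return "'" + table + "'[" + column + "]"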
