Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Add AzureSQL source and Prefect tasks #1043

Merged
merged 51 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from 50 commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
78f02d5
🚀 Adding `Aselite` connector with integration tests
Sep 18, 2024
7efdf03
🚀 Add unit tests for Aselite (Azure SQL)
Sep 19, 2024
5548206
🐛 Fix task utils bug
Sep 20, 2024
b7111a9
🎨 Refactor task utils file to pass code_checker
Sep 20, 2024
497a770
removed prefect dependency from azure_sql source
fdelgadodyvenia Sep 21, 2024
f787e97
Bug located in the return task
fdelgadodyvenia Sep 22, 2024
95705e4
♻️ Change flow and task name to `azure_sql`
Sep 26, 2024
2d99061
Update src/viadot/orchestration/prefect/flows/azure_sql_to_adls.py
adrian-wojcik Sep 26, 2024
15995ab
Update src/viadot/orchestration/prefect/flows/azure_sql_to_adls.py
adrian-wojcik Sep 26, 2024
04cf3ed
Update src/viadot/orchestration/prefect/flows/azure_sql_to_adls.py
adrian-wojcik Sep 26, 2024
30433aa
Update src/viadot/orchestration/prefect/flows/azure_sql_to_adls.py
adrian-wojcik Sep 26, 2024
228036f
Update src/viadot/orchestration/prefect/flows/azure_sql_to_adls.py
adrian-wojcik Sep 26, 2024
3a58b5e
Update src/viadot/orchestration/prefect/flows/azure_sql_to_adls.py
adrian-wojcik Sep 26, 2024
7a23b63
Update src/viadot/orchestration/prefect/tasks/azure_sql.py
adrian-wojcik Sep 26, 2024
9bac63d
Update src/viadot/orchestration/prefect/tasks/azure_sql.py
adrian-wojcik Sep 26, 2024
a32a56f
Update src/viadot/orchestration/prefect/tasks/azure_sql.py
adrian-wojcik Sep 26, 2024
c0888c0
Update src/viadot/orchestration/prefect/tasks/azure_sql.py
adrian-wojcik Sep 26, 2024
328cffb
Update src/viadot/orchestration/prefect/tasks/azure_sql.py
adrian-wojcik Sep 26, 2024
f817598
Update src/viadot/orchestration/prefect/tasks/azure_sql.py
adrian-wojcik Sep 26, 2024
a067fe4
Update tests/integration/orchestration/prefect/flows/test_azure_sql_t…
adrian-wojcik Sep 26, 2024
a3aafd8
Update tests/integration/orchestration/prefect/flows/test_azure_sql_t…
adrian-wojcik Sep 26, 2024
185620d
Update tests/integration/orchestration/prefect/flows/test_azure_sql_t…
adrian-wojcik Sep 26, 2024
3814f50
Update tests/integration/orchestration/prefect/flows/test_azure_sql_t…
adrian-wojcik Sep 26, 2024
d410424
Update tests/integration/orchestration/prefect/flows/test_azure_sql_t…
adrian-wojcik Sep 26, 2024
be0bafa
Update tests/integration/orchestration/prefect/flows/test_azure_sql_t…
adrian-wojcik Sep 26, 2024
0d6e81f
Update tests/integration/orchestration/prefect/flows/test_azure_sql_t…
adrian-wojcik Sep 26, 2024
5e25dd8
Update tests/integration/orchestration/prefect/tasks/test_azure_sql.py
adrian-wojcik Sep 26, 2024
18740fc
Update tests/integration/orchestration/prefect/tasks/test_azure_sql.py
adrian-wojcik Sep 26, 2024
b350036
🐛 Fix task tests bugs
Sep 26, 2024
b7886f7
🐛 Fix bugs in azure sql unit tests
Sep 26, 2024
d8ec4f3
🎨 Changed docstring description for parameter `convert_bytes`
Sep 27, 2024
1a66c21
🔥 Remove task tests as all of it is covered in unit tests
Sep 27, 2024
5a9a264
🎨 Improved structure of the `AzureSQL` source class and added docstring
Rafalz13 Sep 30, 2024
e2bb1a9
✅ Modified tests structure
Rafalz13 Sep 30, 2024
44d9401
🎨 Removed unused parameters and improved structure of the code
Rafalz13 Sep 30, 2024
6b2dd2b
🎨 Removed unused parameters and improved structure of the flow code
Rafalz13 Sep 30, 2024
0e0ecac
🎨 Improved structure of the tests code
Rafalz13 Sep 30, 2024
7b1687b
🎨 Improved structure of the `__init__` files
Rafalz13 Sep 30, 2024
4fc4866
🎨 Added extra spaces in `chunk_df`
Rafalz13 Sep 30, 2024
bd29ab3
🚧 Added `pragma: allowlist secret`
Rafalz13 Sep 30, 2024
edb7b62
🚧 Added `# noqa: S105`
Rafalz13 Sep 30, 2024
ac82a79
🚧 Added `pragma: allowlist secret`
Rafalz13 Sep 30, 2024
f03ef40
🚨 Fix linter and pre-commit errors
trymzet Sep 30, 2024
ceefe7c
🐛 Removed `src`
Rafalz13 Sep 30, 2024
95a5d29
✅ Updated tests
Rafalz13 Oct 1, 2024
9eaf43b
🐛 Fixed names
Rafalz13 Oct 1, 2024
73e303e
🐛 Added fixtures
Rafalz13 Oct 1, 2024
e44f5ef
Update tests/unit/test_azure_sql.py
Rafalz13 Oct 2, 2024
735311d
🎨 Moved operations from task to source and created new `to_df()` method
Rafalz13 Oct 2, 2024
c74367f
🎨 Changed to `map` and added `super()` in to_df method
Rafalz13 Oct 4, 2024
ce82748
📝 Add connector documentation
trymzet Oct 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/viadot/orchestration/prefect/flows/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Import flows."""

from .azure_sql_to_adls import azure_sql_to_adls
from .bigquery_to_adls import bigquery_to_adls
from .cloud_for_customers_to_adls import cloud_for_customers_to_adls
from .cloud_for_customers_to_databricks import cloud_for_customers_to_databricks
Expand Down Expand Up @@ -34,6 +35,7 @@


__all__ = [
"azure_sql_to_adls",
"bigquery_to_adls",
"cloud_for_customers_to_adls",
"cloud_for_customers_to_databricks",
Expand Down
77 changes: 77 additions & 0 deletions src/viadot/orchestration/prefect/flows/azure_sql_to_adls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Flows for downloading data from Azure SQL and uploading it to Azure ADLS."""

from typing import Any

from prefect import flow
from prefect.task_runners import ConcurrentTaskRunner

from viadot.orchestration.prefect.tasks import azure_sql_to_df, df_to_adls


@flow(
    name="Azure SQL extraction to ADLS",
    description="Extract data from Azure SQL"
    + " and load it into Azure Data Lake Storage.",
    retries=1,
    retry_delay_seconds=60,
    task_runner=ConcurrentTaskRunner,
    log_prints=True,
)
def azure_sql_to_adls(
    query: str | None = None,
    credentials_secret: str | None = None,
    validate_df_dict: dict[str, Any] | None = None,
    convert_bytes: bool = False,
    remove_special_characters: bool | None = None,
    columns_to_clean: list[str] | None = None,
    adls_config_key: str | None = None,
    adls_azure_key_vault_secret: str | None = None,
    adls_path: str | None = None,
    adls_path_overwrite: bool = False,
) -> None:
    """Download data from Azure SQL and upload it to ADLS.

    Args:
        query (str, optional): Query to perform on a database. Defaults to None.
        credentials_secret (str, optional): The name of the Azure Key Vault
            secret containing a dictionary with database credentials.
            Defaults to None.
        validate_df_dict (dict[str, Any], optional): A dictionary with an optional
            list of tests to verify the output dataframe. If defined, triggers
            validation of the downloaded data. Defaults to None.
        convert_bytes (bool, optional): Whether to convert bytes columns to int,
            as pulling data with bytes can lead to malformed data in the data
            frame. Defaults to False.
        remove_special_characters (bool, optional): Whether to remove special
            characters, such as escape symbols, from the data. Defaults to None.
        columns_to_clean (list[str], optional): Columns to clean, used together
            with remove_special_characters. If None, the whole data frame is
            processed. Defaults to None.
        adls_config_key (str, optional): The key in the viadot config holding
            relevant credentials. Defaults to None.
        adls_azure_key_vault_secret (str, optional): The name of the Azure Key
            Vault secret containing a dictionary with ACCOUNT_NAME and Service
            Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the
            Azure Data Lake. Defaults to None.
        adls_path (str, optional): Azure Data Lake destination file path (with
            file name). Defaults to None.
        adls_path_overwrite (bool, optional): Whether to overwrite the file in
            ADLS. Defaults to False.
    """
    data_frame = azure_sql_to_df(
        query=query,
        credentials_secret=credentials_secret,
        validate_df_dict=validate_df_dict,
        convert_bytes=convert_bytes,
        remove_special_characters=remove_special_characters,
        columns_to_clean=columns_to_clean,
    )

    # NOTE(review): annotated `-> None` but the `df_to_adls` result is returned;
    # kept the return for any caller that uses it — confirm intended contract.
    return df_to_adls(
        df=data_frame,
        path=adls_path,
        credentials_secret=adls_azure_key_vault_secret,
        config_key=adls_config_key,
        overwrite=adls_path_overwrite,
    )
2 changes: 2 additions & 0 deletions src/viadot/orchestration/prefect/tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Imports."""

from .adls import adls_upload, df_to_adls
from .azure_sql import azure_sql_to_df
from .bcp import bcp
from .bigquery import bigquery_to_df
from .cloud_for_customers import cloud_for_customers_to_df
Expand Down Expand Up @@ -31,6 +32,7 @@


__all__ = [
"azure_sql_to_df",
"adls_upload",
"bcp",
"clone_repo",
Expand Down
70 changes: 70 additions & 0 deletions src/viadot/orchestration/prefect/tasks/azure_sql.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Task for downloading data from Azure SQL."""

from typing import Any, Literal

import pandas as pd
from prefect import task

from viadot.orchestration.prefect.utils import get_credentials
from viadot.sources import AzureSQL
from viadot.utils import validate


@task(retries=3, retry_delay_seconds=10, timeout_seconds=60 * 60)
def azure_sql_to_df(
    query: str | None = None,
    credentials_secret: str | None = None,
    validate_df_dict: dict[str, Any] | None = None,
    convert_bytes: bool = False,
    remove_special_characters: bool | None = None,
    columns_to_clean: list[str] | None = None,
    if_empty: Literal["warn", "skip", "fail"] = "warn",
) -> pd.DataFrame:
    """Task to download data from Azure SQL.

    Args:
        query (str, optional): Query to perform on a database. Defaults to None.
        credentials_secret (str, optional): The name of the Azure Key Vault
            secret containing a dictionary with database credentials.
            Defaults to None.
        validate_df_dict (dict[str, Any], optional): A dictionary with an optional
            list of tests to verify the output dataframe. If defined, triggers
            validation of the downloaded data. Defaults to None.
        convert_bytes (bool, optional): Whether to convert bytes columns to int,
            as pulling data with bytes can lead to malformed data in the data
            frame. Defaults to False.
        remove_special_characters (bool, optional): Whether to remove special
            characters, such as escape symbols, from the data. Defaults to None.
        columns_to_clean (list[str], optional): Columns to clean, used together
            with remove_special_characters. If None, the whole data frame is
            processed. Defaults to None.
        if_empty (Literal["warn", "skip", "fail"], optional): What to do if the
            query returns no data. Defaults to "warn".

    Raises:
        ValueError: If `credentials_secret` is not provided or is empty.

    Returns:
        pd.DataFrame: The response data as a pandas DataFrame.
    """
    # Fail fast: without a secret name there is no way to obtain DB credentials.
    if not credentials_secret:
        msg = "`credentials_secret` has to be specified and not empty."
        raise ValueError(msg)

    credentials = get_credentials(credentials_secret)

    azure_sql = AzureSQL(credentials=credentials)

    df = azure_sql.to_df(
        query=query,
        if_empty=if_empty,
        convert_bytes=convert_bytes,
        remove_special_characters=remove_special_characters,
        columns_to_clean=columns_to_clean,
    )

    # Optional post-download validation of the resulting DataFrame.
    if validate_df_dict is not None:
        validate(df=df, tests=validate_df_dict)

    return df
41 changes: 4 additions & 37 deletions src/viadot/orchestration/prefect/tasks/task_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ def dtypes_to_json_task(dtypes_dict: dict[str, Any], local_json_path: str) -> No
dtypes_dict (dict): Dictionary containing data types.
local_json_path (str): Path to local json file.
"""
with Path(local_json_path).open("w") as fp:
json.dump(dtypes_dict, fp)
with Path(local_json_path).open("w") as file_path:
json.dump(dtypes_dict, file_path)


@task
Expand Down Expand Up @@ -59,7 +59,7 @@ def get_sql_dtypes_from_df(df: pd.DataFrame) -> dict:
"Categorical": "VARCHAR(500)",
"Time": "TIME",
"Boolean": "VARCHAR(5)", # Bool is True/False, Microsoft expects 0/1
"DateTime": "DATETIMEOFFSET", # DATETIMEOFFSET is the only timezone-aware dtype in TSQL
"DateTime": "DATETIMEOFFSET", # DATETIMEOFFSET is timezone-aware dtype in TSQL
"Object": "VARCHAR(500)",
"EmailAddress": "VARCHAR(50)",
"File": None,
Expand All @@ -73,7 +73,7 @@ def get_sql_dtypes_from_df(df: pd.DataFrame) -> dict:
"String": "VARCHAR(500)",
"IPAddress": "VARCHAR(39)",
"Path": "VARCHAR(255)",
"TimeDelta": "VARCHAR(20)", # datetime.datetime.timedelta; eg. '1 days 11:00:00'
"TimeDelta": "VARCHAR(20)", # datetime.datetime.timedelta; eg.'1 days 11:00:00'
"URL": "VARCHAR(255)",
"Count": "INT",
}
Expand Down Expand Up @@ -209,36 +209,3 @@ def union_dfs_task(dfs: list[pd.DataFrame]) -> pd.DataFrame:
different size of DataFrames NaN values can appear.
"""
return pd.concat(dfs, ignore_index=True)


@task
def df_clean_column(
df: pd.DataFrame, columns_to_clean: list[str] | None = None
) -> pd.DataFrame:
"""Remove special characters from a pandas DataFrame.

Args:
df (pd.DataFrame): The DataFrame to clean.
columns_to_clean (List[str]): A list of columns to clean. Defaults is None.

Returns:
pd.DataFrame: The cleaned DataFrame
"""
df = df.copy()

if columns_to_clean is None:
df.replace(
to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"],
value=["", ""],
regex=True,
inplace=True,
)
else:
for col in columns_to_clean:
df[col].replace(
to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"],
value=["", ""],
regex=True,
inplace=True,
)
return df
2 changes: 2 additions & 0 deletions src/viadot/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from ._duckdb import DuckDB
from ._trino import Trino
from .azure_sql import AzureSQL
from .bigquery import BigQuery
from .cloud_for_customers import CloudForCustomers
from .customer_gauge import CustomerGauge
Expand All @@ -24,6 +25,7 @@


__all__ = [
"AzureSQL",
"BigQuery",
"CloudForCustomers",
"CustomerGauge",
Expand Down
Loading