Merge pull request #19 from DataChefHQ/feature/14-add-kafka-batch-publisher

feat: add KafkaBatchPublisher
Showing 10 changed files with 355 additions and 88 deletions.
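The diff below covers only the shared pytest fixtures; the KafkaBatchPublisher itself lands in other files of this PR and is not reproduced here. Purely for orientation, a generic batch publisher on top of the kafka-python client might look like the sketch below. The class name echoes the PR title, but every method name and parameter here is an assumption, not code from this repository.

from kafka import KafkaProducer  # kafka-python client


class KafkaBatchPublisher:
    """Hypothetical sketch: buffer messages and publish them to Kafka in batches."""

    def __init__(self, bootstrap_servers: str, topic: str, batch_size: int = 100) -> None:
        self.producer = KafkaProducer(bootstrap_servers=bootstrap_servers)
        self.topic = topic
        self.batch_size = batch_size
        self.buffer: list[bytes] = []

    def add(self, message: bytes) -> None:
        # Accumulate messages and publish once the batch is full.
        self.buffer.append(message)
        if len(self.buffer) >= self.batch_size:
            self.flush()

    def flush(self) -> None:
        for message in self.buffer:
            self.producer.send(self.topic, value=message)
        self.producer.flush()  # block until all pending sends are delivered
        self.buffer.clear()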
@@ -1,9 +1,14 @@
-import pytest
-from typing import Any
+import io
 import json
+import logging
 import os
-from pyspark.sql import SparkSession
+import shutil
+from contextlib import redirect_stdout
+from typing import Any
+
+import pytest
+from pyspark.conf import SparkConf
+from pyspark.sql import DataFrame, SparkSession


 @pytest.fixture(scope="session")
@@ -45,18 +50,51 @@ def spark_session() -> SparkSession:
     for key, value in LOCAL_CONFIG.items():
         spark_conf.set(key, str(value))

-    spark_session = (
-        SparkSession.builder.master("local[*]")
-        .appName("LocalTestSparkleApp")
-        .config(conf=spark_conf)
-    )
+    spark_session = SparkSession.builder.master("local[*]").appName("LocalTestSparkleApp").config(conf=spark_conf)

     if ivy_settings_path:
         spark_session.config("spark.jars.ivySettings", ivy_settings_path)

     return spark_session.getOrCreate()


+@pytest.fixture(scope="session")
+def checkpoint_directory():
+    """Fixture to validate and remove the checkpoint directory after tests.
+
+    To avoid test failures due to non-unique directories, the user should add a
+    subdirectory to this path when using this fixture.
+
+    Example:
+        >>> dir = checkpoint_directory + subdir
+    """
+    checkpoint_dir = "/tmp/checkpoint/"
+
+    yield checkpoint_dir
+
+    # Remove the checkpoint directory if it exists
+    if os.path.exists(checkpoint_dir):
+        shutil.rmtree(checkpoint_dir)
+        logging.info(f"Checkpoint directory {checkpoint_dir} has been removed.")
+    else:
+        logging.warning(f"Checkpoint directory {checkpoint_dir} was not found.")
+
+
+@pytest.fixture(scope="session", autouse=True)
+def cleanup_logging_handlers():
+    """Fixture to cleanup logging handlers after tests.
+
+    Prevents logging errors at the end of the report.
+    Taken from [here](https://github.com/pytest-dev/pytest/issues/5502#issuecomment-1803676152)
+    """
+    try:
+        yield
+    finally:
+        for handler in logging.root.handlers[:]:
+            if isinstance(handler, logging.StreamHandler):
+                logging.root.removeHandler(handler)
+
+
 @pytest.fixture
 def user_dataframe(spark_session: SparkSession):
     """Fixture for creating a DataFrame with user data.
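As a usage sketch (the test below is illustrative, not part of this PR), the new checkpoint_directory fixture composes with spark_session as its docstring suggests, with a per-test subdirectory to keep checkpoint paths unique:

import os


def test_checkpointed_count(spark_session, checkpoint_directory):
    # Unique subdirectory per test, as the fixture docstring recommends.
    spark_session.sparkContext.setCheckpointDir(os.path.join(checkpoint_directory, "test_checkpointed_count"))
    df = spark_session.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
    assert df.checkpoint().count() == 2  # checkpoint() materializes the plan under that directory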
@@ -71,21 +109,11 @@ def user_dataframe(spark_session: SparkSession):
         pyspark.sql.DataFrame: A Spark DataFrame with sample user data.
     """
     data = [
-        {
-            "name": "John",
-            "surname": "Doe",
-            "phone": "12345",
-            "email": "[email protected]",
-        },
-        {
-            "name": "Jane",
-            "surname": "Doe",
-            "phone": "12345",
-            "email": "[email protected]",
-        },
+        ["John", "Doe", "12345", "[email protected]"],
+        ["Jane", "Doe", "12345", "[email protected]"],
     ]

-    return spark_session.createDataFrame(data)
+    schema = ["name", "surname", "phone", "email"]
+    return spark_session.createDataFrame(data, schema=schema)


 @pytest.fixture
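A quick illustrative check of what the reworked fixture now guarantees (this test is a sketch, not from the PR): because the schema is passed explicitly, column order is deterministic.

def test_user_dataframe_schema(user_dataframe):
    # Explicit schema list fixes the column order, unlike the old dict-based rows.
    assert user_dataframe.columns == ["name", "surname", "phone", "email"]
    assert user_dataframe.count() == 2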
@@ -127,3 +155,17 @@ def json_to_string(dictionary: dict[str, Any]) -> str:
         ensure_ascii=True,
         separators=(",", ":"),
     ).replace("\n", "")
+
+
+def log_spark_dataframe(df: DataFrame, *, truncate: bool = False, name: str = "") -> None:
+    """Logs the contents of a Spark DataFrame in tabular format.
+
+    Useful when Pytest is configured to capture only logs, so `df.show()` won't work.
+
+    Example:
+        >>> log_spark_dataframe(df, name="My DataFrame")
+    """
+    buffer = io.StringIO()
+    with redirect_stdout(buffer):
+        df.show(truncate=truncate)
+    logging.info(f"\n{name}\n{buffer.getvalue()}")
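And a sketch of how the new helper pairs with pytest's caplog fixture (assuming log_spark_dataframe is importable where the test lives; the import path and test name are assumptions):

import logging


def test_log_spark_dataframe(user_dataframe, caplog):
    with caplog.at_level(logging.INFO):
        log_spark_dataframe(user_dataframe, name="users")
    # The rendered table from df.show() ends up in the captured log output.
    assert "users" in caplog.text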