-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
wip: add kafka batch writer and related tests
also: - add WIP marker to pytest
- Loading branch information
Federico Zambelli
committed
Sep 24, 2024
1 parent
34b7669
commit 9c9d7f0
Showing
4 changed files
with
154 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
import logging | ||
from typing import Any | ||
|
||
import pytest | ||
from pyspark.sql.functions import monotonically_increasing_id | ||
|
||
from sparkle.config.kafka_config import SchemaFormat | ||
from sparkle.reader.kafka_reader import KafkaReader | ||
from sparkle.reader.schema_registry import SchemaRegistry | ||
from sparkle.writer.kafka_writer import KafkaBatchPublisher | ||
|
||
KAFKA_BROKER_URL = "localhost:9092" | ||
UNIQUE_ID_COLUMN = "id" | ||
|
||
|
||
@pytest.fixture
def kafka_config() -> dict[str, Any]:
    """Provide Kafka configuration options for the batch-writer tests.

    Returns:
        dict[str, Any]: Configuration with three keys:
            - ``kafka_options``: Spark Kafka connector options
              (bootstrap servers and security protocol).
            - ``kafka_topic``: the topic the tests publish to.
            - ``unique_identifier_column_name``: name of the column used
              as the record key.
    """
    return {
        "kafka_options": {
            "kafka.bootstrap.servers": KAFKA_BROKER_URL,
            "kafka.security.protocol": "PLAINTEXT",
        },
        "kafka_topic": "test-kafka-batch-writer-topic",
        "unique_identifier_column_name": UNIQUE_ID_COLUMN,
    }
|
||
|
||
@pytest.fixture
def mock_schema_registry(mocker):
    """Provide a mock ``SchemaRegistry`` client.

    The mock is spec'd against ``SchemaRegistry`` so attribute access is
    restricted to the real interface. No return values are configured:
    the tests read raw (schema-less) records, so the registry is never
    queried for a schema.
    """
    return mocker.Mock(spec=SchemaRegistry)
|
||
|
||
@pytest.mark.wip
def test_kafka_batch_publisher_write(user_dataframe, kafka_config, spark_session, mock_schema_registry):
    """Verify KafkaBatchPublisher.write publishes every row to Kafka.

    Publishes a DataFrame to the test topic, consumes the topic back via a
    streaming ``KafkaReader`` into an in-memory sink, and asserts that the
    number of consumed records equals the number of published rows.

    Requires a reachable Kafka broker at ``KAFKA_BROKER_URL``.
    """
    df = (
        user_dataframe
        .orderBy(user_dataframe.columns[0])
        .withColumn(UNIQUE_ID_COLUMN, monotonically_increasing_id().cast("string"))
    )

    publisher = KafkaBatchPublisher(
        kafka_options=kafka_config["kafka_options"],
        kafka_topic=kafka_config["kafka_topic"],
        unique_identifier_column_name=kafka_config["unique_identifier_column_name"],
        spark=spark_session,
    )

    # Read the topic back as raw records; the mocked schema registry is
    # never consulted because format_ is SchemaFormat.raw.
    reader = KafkaReader(
        spark=spark_session,
        topic=kafka_config["kafka_topic"],
        schema_registry=mock_schema_registry,
        format_=SchemaFormat.raw,
        schema_version="latest",
        kafka_spark_options={
            "kafka.bootstrap.servers": KAFKA_BROKER_URL,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": True,
        },
    )

    # Start the consumer before publishing so the streaming query is
    # already attached to the topic when the records arrive.
    query = (
        reader.read()
        .writeStream
        .format("memory")
        .queryName("kafka_test")
        .outputMode("append")
        .start()
    )

    try:
        publisher.write(df)

        # Give the streaming query up to 10 seconds to consume the
        # published records into the in-memory table.
        query.awaitTermination(10)

        consumed = spark_session.sql("SELECT * FROM kafka_test")

        # Every published row should have been consumed exactly once.
        assert consumed.count() == df.count()
    finally:
        # Always stop the streaming query so it does not leak into
        # subsequent tests sharing the Spark session.
        query.stop()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters