-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Emit one kafka message per row in a recordbatch
- Loading branch information
Showing
4 changed files
with
101 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,10 +8,10 @@ authors = [ | |
"Amey Chaugule <[email protected]>", | ||
] | ||
edition = "2021" | ||
homepage = "https://github.com/probably-nothing-labs/df-streams" | ||
homepage = "https://github.com/probably-nothing-labs/denormalized" | ||
license = "Apache-2.0" | ||
readme = "README.md" | ||
repository = "https://github.com/probably-nothing-labs/df-streams" | ||
repository = "https://github.com/probably-nothing-labs/denormalized" | ||
version = "0.1.0" | ||
description = "Embeddable stream processing engine" | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
use arrow::json::writer::{JsonFormat, Writer}; | ||
use datafusion::arrow::record_batch::RecordBatch; | ||
use datafusion_common::Result; | ||
|
||
pub trait RowEncoder { | ||
fn encode(&self, batch: &RecordBatch) -> Result<Vec<Vec<u8>>>; | ||
} | ||
|
||
#[derive(Debug, Default)] | ||
// Formats json without any characting separating items. | ||
pub struct NoDelimiter {} | ||
impl JsonFormat for NoDelimiter {} | ||
// writes rows as json without any character separating them | ||
type JsonWriter<W> = Writer<W, NoDelimiter>; | ||
|
||
/// Stateless encoder that serializes record batch rows as JSON.
///
/// The type carries no configuration, so it can be created with
/// `JsonRowEncoder {}` or `JsonRowEncoder::default()`.
#[derive(Debug, Default, Clone)]
pub struct JsonRowEncoder {}
|
||
impl JsonRowEncoder { | ||
pub fn batch_to_json(&self, batch: &RecordBatch) -> Result<Vec<u8>> { | ||
let buf = Vec::new(); | ||
let mut writer = JsonWriter::new(buf); | ||
writer.write(batch)?; | ||
writer.finish()?; | ||
let buf = writer.into_inner(); | ||
|
||
Ok(buf) | ||
} | ||
} | ||
|
||
impl RowEncoder for JsonRowEncoder { | ||
fn encode(&self, batch: &RecordBatch) -> Result<Vec<Vec<u8>>> { | ||
if batch.num_rows() == 0 { | ||
return Ok(vec![]); | ||
} | ||
|
||
// BufWriter uses a buffer size of 8KB | ||
// We therefore double this and flush once we have more than 8KB | ||
let mut buffer = Vec::with_capacity(batch.num_rows()); | ||
|
||
for i in 0..batch.num_rows() { | ||
let row = batch.slice(i, 1); | ||
buffer.push(self.batch_to_json(&row)?); | ||
} | ||
|
||
Ok(buffer) | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use super::{JsonRowEncoder, RowEncoder};

    use datafusion::arrow::array::{Int32Array, StringArray};
    use datafusion::arrow::datatypes::{DataType, Field, Schema};
    use datafusion::arrow::record_batch::RecordBatch;
    use std::sync::Arc;

    /// A four-row batch must be encoded as four standalone JSON objects.
    #[test]
    fn serialize_record_batch_to_json() {
        // Two non-nullable columns: a string and a 32-bit integer.
        let fields = vec![
            Field::new("col1", DataType::Utf8, false),
            Field::new("col2", DataType::Int32, false),
        ];
        let schema = Arc::new(Schema::new(fields));

        let strings = StringArray::from(vec!["a", "b", "c", "d"]);
        let numbers = Int32Array::from(vec![1, 10, 20, 100]);
        let batch =
            RecordBatch::try_new(schema, vec![Arc::new(strings), Arc::new(numbers)]).unwrap();

        let encoded = JsonRowEncoder {}.encode(&batch).unwrap();

        // One JSON object per input row, with no delimiter between or after them.
        let expected: Vec<&[u8]> = [
            r#"{"col1":"a","col2":1}"#,
            r#"{"col1":"b","col2":10}"#,
            r#"{"col1":"c","col2":20}"#,
            r#"{"col1":"d","col2":100}"#,
        ]
        .into_iter()
        .map(str::as_bytes)
        .collect();

        assert_eq!(encoded, expected);
    }
}