Skip to content

Commit

Permalink
standardise on UTF-8 when converting strings to/from bytes (#839)
Browse the repository at this point in the history
* standardise on UTF-8 when converting strings to/from bytes

* use method that works with Java 8

* Update strings.scala
Branch information:
pjfanning authored May 19, 2024
1 parent 62e87b8 commit 80b243d
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import org.apache.avro.generic.GenericData
import org.apache.avro.util.Utf8

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets
import java.util.UUID

trait StringEncoders:
Expand Down Expand Up @@ -37,13 +38,15 @@ object UTF8StringEncoder extends Encoder[String] :
* An [[Encoder]] for Strings that encodes as [[ByteBuffer]]s.
*/
object ByteStringEncoder extends Encoder[String] :
override def encode(schema: Schema): String => Any = string => ByteBuffer.wrap(string.getBytes)
override def encode(schema: Schema): String => Any = string =>
ByteBuffer.wrap(string.getBytes(StandardCharsets.UTF_8))

/**
* An [[Encoder]] for Strings that encodes as [[GenericFixed]]s.
*/
object FixedStringEncoder extends Encoder[String] :
override def encode(schema: Schema): String => Any = string =>
if (string.getBytes.length > schema.getFixedSize)
throw new Avro4sEncodingException(s"Cannot write string with ${string.getBytes.length} bytes to fixed type of size ${schema.getFixedSize}")
GenericData.get.createFixed(null, ByteBuffer.allocate(schema.getFixedSize).put(string.getBytes).array, schema).asInstanceOf[GenericData.Fixed]
val bytes = string.getBytes(StandardCharsets.UTF_8)
if (bytes.length > schema.getFixedSize)
throw new Avro4sEncodingException(s"Cannot write string with ${bytes.length} bytes to fixed type of size ${schema.getFixedSize}")
GenericData.get.createFixed(null, ByteBuffer.allocate(schema.getFixedSize).put(bytes).array, schema).asInstanceOf[GenericData.Fixed]
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import com.sksamuel.avro4s.AvroSchema
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import java.nio.charset.StandardCharsets

/**
* Tests created from README examples
*
Expand Down Expand Up @@ -51,7 +53,7 @@ class ReadMeExamples extends AnyWordSpec with Matchers {

json shouldBe ("{\"name\":\"ennio morricone\",\"birthplace\":\"rome\",\"compositions\":[\"legend of 1900\",\"ecstasy of gold\"]}")

val in = new ByteArrayInputStream(json.getBytes("UTF-8"))
val in = new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8))
val schema = AvroSchema[Composer]
val input = AvroInputStream.json[Composer].from(in).build(schema)
val result = input.iterator.toSeq
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets

class StringDecoderTest extends AnyFunSuite with Matchers {

Expand Down Expand Up @@ -43,14 +44,14 @@ class StringDecoderTest extends AnyFunSuite with Matchers {
test("decode from byte buffers to strings") {
val schema = AvroSchema[FooString]
val record = new GenericData.Record(schema)
record.put("str", ByteBuffer.wrap("hello".getBytes))
record.put("str", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
Decoder[FooString].decode(schema).apply(record) shouldBe FooString("hello")
}

test("decode from byte arrays to strings") {
val schema = AvroSchema[FooString]
val record = new GenericData.Record(schema)
record.put("str", "hello".getBytes)
record.put("str", "hello".getBytes(StandardCharsets.UTF_8))
Decoder[FooString].decode(schema).apply(record) shouldBe FooString("hello")
}
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
package com.sksamuel.avro4s.record.encoder

import java.nio.ByteBuffer
import com.sksamuel.avro4s.{AvroSchema, Encoder, SchemaFor}
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericFixed, GenericRecord}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets

class ByteArrayEncoderTest extends AnyFunSuite with Matchers {

test("encode byte arrays as BYTES type") {
Expand Down Expand Up @@ -72,7 +74,7 @@ class ByteArrayEncoderTest extends AnyFunSuite with Matchers {
val schema = SchemaBuilder.fixed("foo").size(7)
val fixed = Encoder[Array[Byte]]
.encode(schema)
.apply("hello".getBytes)
.apply("hello".getBytes(StandardCharsets.UTF_8))
.asInstanceOf[GenericFixed]
fixed.bytes().toList shouldBe Seq(104, 101, 108, 108, 111, 0, 0)
fixed.bytes().length shouldBe 7
Expand All @@ -82,7 +84,7 @@ class ByteArrayEncoderTest extends AnyFunSuite with Matchers {
val schema = SchemaBuilder.fixed("foo").size(7)
val fixed = Encoder[ByteBuffer]
.encode(schema)
.apply(ByteBuffer.wrap("hello".getBytes))
.apply(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
.asInstanceOf[GenericFixed]
fixed.bytes().toList shouldBe Seq(104, 101, 108, 108, 111, 0, 0)
fixed.bytes().length shouldBe 7
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package com.sksamuel.avro4s.streams.output

import java.io.ByteArrayOutputStream
import java.nio.charset.StandardCharsets

import com.sksamuel.avro4s.{AvroOutputStream, AvroSchema, Encoder}
import org.apache.avro.file.CodecFactory
import org.scalatest.matchers.should.Matchers
Expand All @@ -18,24 +20,24 @@ class AvroDataOutputStreamCodecTest extends AnyWordSpec with Matchers {
val output = AvroOutputStream.data[Composer](schema, Encoder[Composer]).to(baos).build()
output.write(ennio)
output.close()
new String(baos.toByteArray) should include("birthplace")
new String(baos.toByteArray) should include("compositions")
baos.toString(StandardCharsets.UTF_8.name()) should include("birthplace")
baos.toString(StandardCharsets.UTF_8.name()) should include("compositions")
}

"include deflate coded in metadata when serialized with deflate" in {
val baos = new ByteArrayOutputStream()
val output = AvroOutputStream.data[Composer](schema, Encoder[Composer]).to(baos).withCodec(CodecFactory.deflateCodec(CodecFactory.DEFAULT_DEFLATE_LEVEL)).build()
output.write(ennio)
output.close()
new String(baos.toByteArray) should include("deflate")
baos.toString(StandardCharsets.UTF_8.name()) should include("deflate")
}

"include bzip2 coded in metadata when serialized with bzip2" in {
val baos = new ByteArrayOutputStream()
val output = AvroOutputStream.data[Composer](schema, Encoder[Composer]).to(baos).withCodec(CodecFactory.bzip2Codec).build()
output.write(ennio)
output.close()
new String(baos.toByteArray) should include("bzip2")
baos.toString(StandardCharsets.UTF_8.name()) should include("bzip2")
}
}
}

0 comments on commit 80b243d

Please sign in to comment.