Skip to content

Commit

Permalink
standardise on UTF-8 when converting strings to/from bytes (#839)
Browse the repository at this point in the history
* standardise on UTF-8 when converting strings to/from bytes

* use method that works with Java 8

* Update strings.scala
Branch information:
pjfanning authored May 19, 2024
1 parent 62e87b8 commit 80b243d
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import org.apache.avro.generic.GenericData
import org.apache.avro.util.Utf8

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets
import java.util.UUID

trait StringEncoders:
Expand Down Expand Up @@ -37,13 +38,15 @@ object UTF8StringEncoder extends Encoder[String] :
* An [[Encoder]] for Strings that encodes as [[ByteBuffer]]s.
*/
object ByteStringEncoder extends Encoder[String] :
override def encode(schema: Schema): String => Any = string => ByteBuffer.wrap(string.getBytes)
override def encode(schema: Schema): String => Any = string =>
ByteBuffer.wrap(string.getBytes(StandardCharsets.UTF_8))

/**
* An [[Encoder]] for Strings that encodes as [[GenericFixed]]s.
*/
object FixedStringEncoder extends Encoder[String] :
override def encode(schema: Schema): String => Any = string =>
if (string.getBytes.length > schema.getFixedSize)
throw new Avro4sEncodingException(s"Cannot write string with ${string.getBytes.length} bytes to fixed type of size ${schema.getFixedSize}")
GenericData.get.createFixed(null, ByteBuffer.allocate(schema.getFixedSize).put(string.getBytes).array, schema).asInstanceOf[GenericData.Fixed]
val bytes = string.getBytes(StandardCharsets.UTF_8)
if (bytes.length > schema.getFixedSize)
throw new Avro4sEncodingException(s"Cannot write string with ${bytes.length} bytes to fixed type of size ${schema.getFixedSize}")
GenericData.get.createFixed(null, ByteBuffer.allocate(schema.getFixedSize).put(bytes).array, schema).asInstanceOf[GenericData.Fixed]
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import com.sksamuel.avro4s.AvroSchema
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import java.nio.charset.StandardCharsets

/**
* Tests created from README examples
*
Expand Down Expand Up @@ -51,7 +53,7 @@ class ReadMeExamples extends AnyWordSpec with Matchers {

json shouldBe ("{\"name\":\"ennio morricone\",\"birthplace\":\"rome\",\"compositions\":[\"legend of 1900\",\"ecstasy of gold\"]}")

val in = new ByteArrayInputStream(json.getBytes("UTF-8"))
val in = new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8))
val schema = AvroSchema[Composer]
val input = AvroInputStream.json[Composer].from(in).build(schema)
val result = input.iterator.toSeq
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets

class StringDecoderTest extends AnyFunSuite with Matchers {

Expand Down Expand Up @@ -43,14 +44,14 @@ class StringDecoderTest extends AnyFunSuite with Matchers {
test("decode from byte buffers to strings") {
val schema = AvroSchema[FooString]
val record = new GenericData.Record(schema)
record.put("str", ByteBuffer.wrap("hello".getBytes))
record.put("str", ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
Decoder[FooString].decode(schema).apply(record) shouldBe FooString("hello")
}

test("decode from byte arrays to strings") {
val schema = AvroSchema[FooString]
val record = new GenericData.Record(schema)
record.put("str", "hello".getBytes)
record.put("str", "hello".getBytes(StandardCharsets.UTF_8))
Decoder[FooString].decode(schema).apply(record) shouldBe FooString("hello")
}
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
package com.sksamuel.avro4s.record.encoder

import java.nio.ByteBuffer
import com.sksamuel.avro4s.{AvroSchema, Encoder, SchemaFor}
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericFixed, GenericRecord}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets

class ByteArrayEncoderTest extends AnyFunSuite with Matchers {

test("encode byte arrays as BYTES type") {
Expand Down Expand Up @@ -72,7 +74,7 @@ class ByteArrayEncoderTest extends AnyFunSuite with Matchers {
val schema = SchemaBuilder.fixed("foo").size(7)
val fixed = Encoder[Array[Byte]]
.encode(schema)
.apply("hello".getBytes)
.apply("hello".getBytes(StandardCharsets.UTF_8))
.asInstanceOf[GenericFixed]
fixed.bytes().toList shouldBe Seq(104, 101, 108, 108, 111, 0, 0)
fixed.bytes().length shouldBe 7
Expand All @@ -82,7 +84,7 @@ class ByteArrayEncoderTest extends AnyFunSuite with Matchers {
val schema = SchemaBuilder.fixed("foo").size(7)
val fixed = Encoder[ByteBuffer]
.encode(schema)
.apply(ByteBuffer.wrap("hello".getBytes))
.apply(ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8)))
.asInstanceOf[GenericFixed]
fixed.bytes().toList shouldBe Seq(104, 101, 108, 108, 111, 0, 0)
fixed.bytes().length shouldBe 7
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package com.sksamuel.avro4s.streams.output

import java.io.ByteArrayOutputStream
import java.nio.charset.StandardCharsets

import com.sksamuel.avro4s.{AvroOutputStream, AvroSchema, Encoder}
import org.apache.avro.file.CodecFactory
import org.scalatest.matchers.should.Matchers
Expand All @@ -18,24 +20,24 @@ class AvroDataOutputStreamCodecTest extends AnyWordSpec with Matchers {
val output = AvroOutputStream.data[Composer](schema, Encoder[Composer]).to(baos).build()
output.write(ennio)
output.close()
new String(baos.toByteArray) should include("birthplace")
new String(baos.toByteArray) should include("compositions")
baos.toString(StandardCharsets.UTF_8.name()) should include("birthplace")
baos.toString(StandardCharsets.UTF_8.name()) should include("compositions")
}

"include deflate coded in metadata when serialized with deflate" in {
val baos = new ByteArrayOutputStream()
val output = AvroOutputStream.data[Composer](schema, Encoder[Composer]).to(baos).withCodec(CodecFactory.deflateCodec(CodecFactory.DEFAULT_DEFLATE_LEVEL)).build()
output.write(ennio)
output.close()
new String(baos.toByteArray) should include("deflate")
baos.toString(StandardCharsets.UTF_8.name()) should include("deflate")
}

"include bzip2 coded in metadata when serialized with bzip2" in {
val baos = new ByteArrayOutputStream()
val output = AvroOutputStream.data[Composer](schema, Encoder[Composer]).to(baos).withCodec(CodecFactory.bzip2Codec).build()
output.write(ennio)
output.close()
new String(baos.toByteArray) should include("bzip2")
baos.toString(StandardCharsets.UTF_8.name()) should include("bzip2")
}
}
}

0 comments on commit 80b243d

Please sign in to comment.