From e1cd3605966d27313118e07a9b37a7eb7f0c9c93 Mon Sep 17 00:00:00 2001 From: Timo Stamm Date: Mon, 3 Jan 2022 23:07:24 +0100 Subject: [PATCH] Use TextDecoder API for decoding UTF-8 from binary data, see #184 --- CHANGELOG.md | 8 +++ MANUAL.md | 32 +++++++++ packages/benchmarks/perf.ts | 86 +++++++++++++++---------- packages/runtime/src/binary-reader.ts | 15 ++++- packages/runtime/src/binary-writer.ts | 11 +++- packages/runtime/src/protobufjs-utf8.ts | 5 ++ 6 files changed, 117 insertions(+), 40 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 480f6e48..2dc7d054 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,14 @@ New features: - The new plugin option `add_pb_suffix` adds the suffix `_pb` to all file names, see #186. +Bug fixes: + +- Use TextDecoder API for decoding UTF-8 from binary data, see #184. + We have been using protobuf.js' algorithm to decode UTF-8, but it has had [bugs](https://github.com/protobufjs/protobuf.js/pull/1486) + in the past. For best possible compatibility, we have switched to the TextDecoder API. + See [MANUAL](./MANUAL.md#utf-8-decoding) for details. + + ### v2.1.0 New features: diff --git a/MANUAL.md b/MANUAL.md index 564916f0..261d8a29 100644 --- a/MANUAL.md +++ b/MANUAL.md @@ -1120,6 +1120,38 @@ The `toBinary` method takes an optional second argument of type Allows to use a custom implementation to encode binary data. +#### UTF-8 decoding + +JavaScript uses UTF-16 for strings, but protobuf uses UTF-8. In order +to serialize to and from binary data, protobuf-ts converts between the +encodings with the [TextEncoder / TextDecoder API](https://developer.mozilla.org/en-US/docs/Web/API/Encoding_API). + +Note that the protobuf [language guide](https://developers.google.com/protocol-buffers/docs/proto3#scalar) states: + +> A string must always contain UTF-8 encoded or 7-bit ASCII text [...] 
+ +If an invalid UTF-8 string is encoded in the binary format, protobuf-ts +will raise an error on decoding through the TextDecoder option `fatal`. +If you do not want that behaviour, use the `readerFactory` option to +pass your own TextDecoder instance. + +As of January 2022, performance of TextDecoder on Node.js falls behind +Node.js' `Buffer`. In order to use `Buffer` to decode UTF-8, use the +`readerFactory` option (note that the cast in this example is not checked at runtime, so it assumes `input` really is a Node.js `Buffer`): + +```ts +const nodeBinaryReadOptions = { + readerFactory: (bytes: Uint8Array) => new BinaryReader(bytes, { + decode(input?: Uint8Array): string { + return input ? (input as Buffer).toString("utf8") : ""; + } + }) +}; +MyMessage.fromBinary(bytes, nodeBinaryReadOptions); +``` + + + #### Conformance `protobuf-ts` strictly conforms to the protobuf spec. It passes all diff --git a/packages/benchmarks/perf.ts b/packages/benchmarks/perf.ts index 896169cd..35448b0d 100644 --- a/packages/benchmarks/perf.ts +++ b/packages/benchmarks/perf.ts @@ -4,9 +4,14 @@ import {FileDescriptorSet as tsProtoType} from "./testees/ts-proto.default/.plug import {FileDescriptorSet as googleProtobufType} from "google-protobuf/google/protobuf/descriptor_pb"; import {FileDescriptorSet as sizeType} from "./testees/protobuf-ts.size/.plugin-out/google/protobuf/descriptor"; import {FileDescriptorSet as speedType} from "./testees/protobuf-ts.speed/.plugin-out/google/protobuf/descriptor"; -import {FileDescriptorSet as sizeBigintType} from "./testees/protobuf-ts.size-bigint/.plugin-out/google/protobuf/descriptor"; -import {FileDescriptorSet as speedBigintType} from "./testees/protobuf-ts.speed-bigint/.plugin-out/google/protobuf/descriptor"; +import { + FileDescriptorSet as sizeBigintType +} from "./testees/protobuf-ts.size-bigint/.plugin-out/google/protobuf/descriptor"; +import { + FileDescriptorSet as speedBigintType +} from "./testees/protobuf-ts.speed-bigint/.plugin-out/google/protobuf/descriptor"; import * as protobufjsNamespace from "./testees/protobufjs/.plugin-out/descriptor" 
+import {BinaryReader} from "@protobuf-ts/runtime"; function bench(name: string, fn: () => void, durationSeconds = 5) { let startTs = performance.now(); @@ -44,50 +49,61 @@ let speedJson = speedType.toJson(speedMessage); let speedBigintMessage = speedBigintType.fromBinary(bytes); let protobufjsMessage = protobufjsType.decode(new Uint8Array(bytes)); let protobufjsJson = protobufjsType.toObject(protobufjsMessage); +const nodeBinaryReadOptions = { + readerFactory: (bytes: Uint8Array) => new BinaryReader(bytes, { + decode(input?: Uint8Array): string { + return input ? (input as Buffer).toString("utf8") : ""; + } + }) +}; console.log('### read binary'); -bench('google-protobuf ', () => googleProtobufType.deserializeBinary(bytes)); -bench('ts-proto ', () => tsProtoType.decode(bytes)); -bench('protobuf-ts (speed) ', () => speedType.fromBinary(bytes)); -bench('protobuf-ts (speed, bigint) ', () => speedBigintType.fromBinary(bytes)); -bench('protobuf-ts (size) ', () => sizeType.fromBinary(bytes)); -bench('protobuf-ts (size, bigint) ', () => sizeBigintType.fromBinary(bytes)); -bench('protobufjs ', () => protobufjsType.decode(new Uint8Array(bytes))); +bench('google-protobuf ', () => googleProtobufType.deserializeBinary(bytes)); +bench('ts-proto ', () => tsProtoType.decode(bytes)); +bench('protobuf-ts (speed) ', () => speedType.fromBinary(bytes)); +bench('protobuf-ts (speed, bigint) ', () => speedBigintType.fromBinary(bytes)); +bench('protobuf-ts (size) ', () => sizeType.fromBinary(bytes)); +bench('protobuf-ts (size, bigint) ', () => sizeBigintType.fromBinary(bytes)); +bench('protobuf-ts (speed, node/Buffer) ', () => speedType.fromBinary(bytes, nodeBinaryReadOptions)); +bench('protobuf-ts (speed, bigint, node/Buffer) ', () => speedBigintType.fromBinary(bytes, nodeBinaryReadOptions)); +bench('protobuf-ts (size, node/Buffer) ', () => sizeType.fromBinary(bytes, nodeBinaryReadOptions)); +bench('protobuf-ts (size, bigint, node/Buffer) ', () => sizeBigintType.fromBinary(bytes, 
nodeBinaryReadOptions)); +bench('protobufjs ', () => protobufjsType.decode(new Uint8Array(bytes))); console.log('### write binary'); -bench('google-protobuf ', () => googleProtobufMessage.serializeBinary()); -bench('ts-proto ', () => tsProtoType.encode(tsProtoMessage)); -bench('protobuf-ts (speed) ', () => speedType.toBinary(speedMessage)); -bench('protobuf-ts (speed, bigint) ', () => speedBigintType.toBinary(speedBigintMessage)); -bench('protobuf-ts (size) ', () => sizeType.toBinary(sizeMessage)); -bench('protobuf-ts (size, bigint) ', () => sizeBigintType.toBinary(sizeBigintMessage)); -bench('protobufjs ', () => protobufjsType.encode(protobufjsMessage).finish()); +bench('google-protobuf ', () => googleProtobufMessage.serializeBinary()); +bench('ts-proto ', () => tsProtoType.encode(tsProtoMessage)); +bench('protobuf-ts (speed) ', () => speedType.toBinary(speedMessage)); +bench('protobuf-ts (speed, bigint) ', () => speedBigintType.toBinary(speedBigintMessage)); +bench('protobuf-ts (size) ', () => sizeType.toBinary(sizeMessage)); +bench('protobuf-ts (size, bigint) ', () => sizeBigintType.toBinary(sizeBigintMessage)); +bench('protobufjs ', () => protobufjsType.encode(protobufjsMessage).finish()); console.log('### from partial'); -bench('ts-proto ', () => tsProtoType.fromPartial(tsProtoMessage)); -bench('protobuf-ts (speed) ', () => speedType.create(sizeMessage)); -bench('protobuf-ts (size) ', () => sizeType.create(speedMessage)); +bench('ts-proto ', () => tsProtoType.fromPartial(tsProtoMessage)); +bench('protobuf-ts (speed) ', () => speedType.create(sizeMessage)); +bench('protobuf-ts (size) ', () => sizeType.create(speedMessage)); console.log('### read json string'); -bench('ts-proto ', () => tsProtoType.fromJSON(JSON.parse(tsProtoJsonString))); -bench('protobuf-ts (speed) ', () => speedType.fromJsonString(tsProtoJsonString)); -bench('protobuf-ts (size) ', () => sizeType.fromJsonString(tsProtoJsonString)); -bench('protobufjs ', () => 
protobufjsType.fromObject(JSON.parse(tsProtoJsonString))); +bench('ts-proto ', () => tsProtoType.fromJSON(JSON.parse(tsProtoJsonString))); +bench('protobuf-ts (speed) ', () => speedType.fromJsonString(tsProtoJsonString)); +bench('protobuf-ts (size) ', () => sizeType.fromJsonString(tsProtoJsonString)); +bench('protobufjs ', () => protobufjsType.fromObject(JSON.parse(tsProtoJsonString))); console.log('### write json string'); -bench('ts-proto ', () => JSON.stringify(tsProtoType.toJSON(tsProtoMessage))); -bench('protobuf-ts (speed) ', () => speedType.toJsonString(speedMessage)); -bench('protobuf-ts (size) ', () => sizeType.toJsonString(sizeMessage)); -bench('protobufjs ', () => JSON.stringify(protobufjsType.toObject(protobufjsMessage))); +bench('ts-proto ', () => JSON.stringify(tsProtoType.toJSON(tsProtoMessage))); +bench('protobuf-ts (speed) ', () => speedType.toJsonString(speedMessage)); +bench('protobuf-ts (size) ', () => sizeType.toJsonString(sizeMessage)); +bench('protobufjs ', () => JSON.stringify(protobufjsType.toObject(protobufjsMessage))); console.log('### read json object'); -bench('ts-proto ', () => tsProtoType.fromJSON(tsProtoJson)); -bench('protobuf-ts (speed) ', () => speedType.fromJson(speedJson)); -bench('protobuf-ts (size) ', () => sizeType.fromJson(sizeJson)); -bench('protobufjs ', () => protobufjsType.fromObject(protobufjsJson)); +bench('ts-proto ', () => tsProtoType.fromJSON(tsProtoJson)); +bench('protobuf-ts (speed) ', () => speedType.fromJson(speedJson)); +bench('protobuf-ts (size) ', () => sizeType.fromJson(sizeJson)); +bench('protobufjs ', () => protobufjsType.fromObject(protobufjsJson)); console.log('### write json object'); -bench('ts-proto ', () => tsProtoType.toJSON(tsProtoMessage)); -bench('protobuf-ts (speed) ', () => speedType.toJson(speedMessage)); -bench('protobuf-ts (size) ', () => sizeType.toJson(sizeMessage)); -bench('protobufjs ', () => protobufjsType.toObject(protobufjsMessage)); +bench('ts-proto ', () => 
tsProtoType.toJSON(tsProtoMessage)); +bench('protobuf-ts (speed) ', () => speedType.toJson(speedMessage)); +bench('protobuf-ts (size) ', () => sizeType.toJson(sizeMessage)); +bench('protobufjs ', () => protobufjsType.toObject(protobufjsMessage)); diff --git a/packages/runtime/src/binary-reader.ts b/packages/runtime/src/binary-reader.ts index 2af6a62f..7c777249 100644 --- a/packages/runtime/src/binary-reader.ts +++ b/packages/runtime/src/binary-reader.ts @@ -1,9 +1,14 @@ import type {IBinaryReader} from "./binary-format-contract"; import {WireType} from "./binary-format-contract"; import {PbLong, PbULong} from "./pb-long"; -import {utf8read} from "./protobufjs-utf8"; import {varint32read, varint64read} from "./goog-varint"; +/** + * TextDecoderLike is the subset of the TextDecoder API required by protobuf-ts. + */ +interface TextDecoderLike { + decode(input?: Uint8Array): string; +} export class BinaryReader implements IBinaryReader { @@ -19,13 +24,17 @@ export class BinaryReader implements IBinaryReader { private readonly buf: Uint8Array; private readonly view: DataView; + private readonly textDecoder: TextDecoderLike; - constructor(buf: Uint8Array) { + constructor(buf: Uint8Array, textDecoder?: TextDecoderLike) { this.buf = buf; this.len = buf.length; this.pos = 0; this.view = new DataView(buf.buffer, buf.byteOffset, buf.byteLength); + this.textDecoder = textDecoder ?? new TextDecoder("utf-8", { + fatal: true + }); } @@ -218,7 +227,7 @@ export class BinaryReader implements IBinaryReader { * Read a `string` field, length-delimited data converted to UTF-8 text. 
*/ string(): string { - return utf8read(this.bytes()); + return this.textDecoder.decode(this.bytes()); } } diff --git a/packages/runtime/src/binary-writer.ts b/packages/runtime/src/binary-writer.ts index 44486bd3..183e7da0 100644 --- a/packages/runtime/src/binary-writer.ts +++ b/packages/runtime/src/binary-writer.ts @@ -5,6 +5,13 @@ import {varint32write, varint64write} from "./goog-varint"; import {assertFloat32, assertInt32, assertUInt32} from "./assert"; +/** + * TextEncoderLike is the subset of the TextEncoder API required by protobuf-ts. + */ +interface TextEncoderLike { + encode(input?: string): Uint8Array; +} + export class BinaryWriter implements IBinaryWriter { @@ -35,10 +42,10 @@ export class BinaryWriter implements IBinaryWriter { /** * Text encoder instance to convert UTF-8 to bytes. */ - private readonly textEncoder: TextEncoder; + private readonly textEncoder: TextEncoderLike; - constructor(textEncoder?: TextEncoder) { + constructor(textEncoder?: TextEncoderLike) { this.textEncoder = textEncoder ?? new TextEncoder(); this.chunks = []; this.buf = []; diff --git a/packages/runtime/src/protobufjs-utf8.ts b/packages/runtime/src/protobufjs-utf8.ts index f11010cd..070852e6 100644 --- a/packages/runtime/src/protobufjs-utf8.ts +++ b/packages/runtime/src/protobufjs-utf8.ts @@ -29,6 +29,11 @@ const fromCharCodes = (chunk: number[]) => String.fromCharCode.apply(String, chunk) /** + * @deprecated This function will no longer be exported with the next major + * release, since protobuf-ts has switched to the TextDecoder API. If you need this + * function, please migrate to @protobufjs/utf8. For context, see + * https://github.com/timostamm/protobuf-ts/issues/184 + * * Reads UTF8 bytes as a string. * * See [protobufjs / utf8](https://github.com/protobufjs/protobuf.js/blob/9893e35b854621cce64af4bf6be2cff4fb892796/lib/utf8/index.js#L40)