diff --git a/src/lazy/binary/raw/v1_1/immutable_buffer.rs b/src/lazy/binary/raw/v1_1/immutable_buffer.rs index c2dc99d2..6e20a587 100644 --- a/src/lazy/binary/raw/v1_1/immutable_buffer.rs +++ b/src/lazy/binary/raw/v1_1/immutable_buffer.rs @@ -619,9 +619,13 @@ impl<'a> ImmutableBuffer<'a> { opcode: Opcode, ) -> ParseResult<'a, LazyRawBinaryValue_1_1<'a>> { let input = self; - let header = opcode - .to_header() - .ok_or_else(|| IonError::decoding_error("found a non-value in value position .."))?; + let header = opcode.to_header().ok_or_else(|| { + IonError::decoding_error(format!( + "found a non-value in value position; buffer=<{:X?}>", + input.bytes_range(0, 16.min(input.bytes().len())) + )) + })?; + let header_offset = input.offset(); let (total_length, length_length, value_body_length, delimited_contents) = if opcode.is_delimited_start() { let (contents, after) = input.peek_delimited_container(opcode)?; diff --git a/src/lazy/encoder/binary/v1_1/container_writers.rs b/src/lazy/encoder/binary/v1_1/container_writers.rs index 708e1dd6..4def4c95 100644 --- a/src/lazy/encoder/binary/v1_1/container_writers.rs +++ b/src/lazy/encoder/binary/v1_1/container_writers.rs @@ -5,6 +5,7 @@ use crate::lazy::encoder::binary::v1_1::value_writer::BinaryValueWriter_1_1; use crate::lazy::encoder::binary::v1_1::{flex_sym::FlexSym, flex_uint::FlexUInt}; use crate::lazy::encoder::value_writer::internal::{FieldEncoder, MakeValueWriter}; use crate::lazy::encoder::value_writer::{EExpWriter, SequenceWriter, StructWriter}; +use crate::lazy::encoder::value_writer_config::ValueWriterConfig; use crate::lazy::encoder::write_as_ion::WriteAsIon; use crate::raw_symbol_ref::AsRawSymbolRef; use crate::{IonResult, UInt}; @@ -22,6 +23,7 @@ pub(crate) struct BinaryContainerWriter_1_1<'value, 'top> { // An allocator reference that can be shared with nested container writers allocator: &'top BumpAllocator, encoder: ContainerEncodingKind<'value, 'top>, + write_options: ValueWriterConfig, } enum 
ContainerEncodingKind<'value, 'top> { @@ -57,13 +59,18 @@ impl<'value, 'top> BinaryContainerWriter_1_1<'value, 'top> { start_opcode: u8, allocator: &'top BumpAllocator, buffer: &'value mut BumpVec<'top, u8>, + write_options: ValueWriterConfig, ) -> Self { buffer.push(start_opcode); let encoder = ContainerEncodingKind::Delimited(DelimitedEncoder { start_opcode, buffer, }); - Self { allocator, encoder } + Self { + allocator, + encoder, + write_options, + } } pub fn new_length_prefixed( @@ -71,6 +78,7 @@ impl<'value, 'top> BinaryContainerWriter_1_1<'value, 'top> { flex_len_type_code: u8, allocator: &'top BumpAllocator, buffer: &'value mut BumpVec<'top, u8>, + write_options: ValueWriterConfig, ) -> Self { const DEFAULT_CAPACITY: usize = 512; let encoder = ContainerEncodingKind::LengthPrefixed(LengthPrefixedEncoder { @@ -79,7 +87,11 @@ impl<'value, 'top> BinaryContainerWriter_1_1<'value, 'top> { parent_buffer: buffer, child_values_buffer: BumpVec::with_capacity_in(DEFAULT_CAPACITY, allocator), }); - Self { allocator, encoder } + Self { + allocator, + encoder, + write_options, + } } pub fn allocator(&self) -> &'top BumpAllocator { @@ -95,15 +107,19 @@ impl<'value, 'top> BinaryContainerWriter_1_1<'value, 'top> { matches!(self.encoder, ContainerEncodingKind::Delimited(_)) } + pub fn config(&self) -> ValueWriterConfig { + self.write_options + } + /// Constructs a new [`BinaryValueWriter_1_1`] using this [`BinaryContainerWriter_1_1`]'s /// allocator and targeting its child values buffer. 
fn value_writer<'a>(&'a mut self) -> BinaryValueWriter_1_1<'a, 'top> { + let value_writer_config = self.config(); // Create a value writer that will use the same container encodings it does by default - let delimited_containers = self.has_delimited_containers(); BinaryValueWriter_1_1::new( self.allocator, self.child_values_buffer(), - delimited_containers, + value_writer_config, ) } @@ -156,16 +172,22 @@ impl<'value, 'top> BinaryListWriter_1_1<'value, 'top> { pub(crate) fn new_delimited( allocator: &'top BumpAllocator, buffer: &'value mut BumpVec<'top, u8>, + value_writer_config: ValueWriterConfig, ) -> Self { const DELIMITED_LIST_OPCODE: u8 = 0xF1; - let container_writer = - BinaryContainerWriter_1_1::new_delimited(DELIMITED_LIST_OPCODE, allocator, buffer); + let container_writer = BinaryContainerWriter_1_1::new_delimited( + DELIMITED_LIST_OPCODE, + allocator, + buffer, + value_writer_config, + ); Self::with_container_writer(container_writer) } pub(crate) fn new_length_prefixed( allocator: &'top BumpAllocator, buffer: &'value mut BumpVec<'top, u8>, + value_writer_config: ValueWriterConfig, ) -> Self { const LENGTH_PREFIXED_LIST_TYPE_CODE: u8 = 0xB0; const LENGTH_PREFIXED_FLEX_LEN_LIST_TYPE_CODE: u8 = 0xFB; @@ -174,6 +196,7 @@ impl<'value, 'top> BinaryListWriter_1_1<'value, 'top> { LENGTH_PREFIXED_FLEX_LEN_LIST_TYPE_CODE, allocator, buffer, + value_writer_config, ); Self::with_container_writer(container_writer) } @@ -214,16 +237,22 @@ impl<'value, 'top> BinarySExpWriter_1_1<'value, 'top> { pub(crate) fn new_delimited( allocator: &'top BumpAllocator, buffer: &'value mut BumpVec<'top, u8>, + value_writer_config: ValueWriterConfig, ) -> Self { const DELIMITED_SEXP_OPCODE: u8 = 0xF2; - let container_writer = - BinaryContainerWriter_1_1::new_delimited(DELIMITED_SEXP_OPCODE, allocator, buffer); + let container_writer = BinaryContainerWriter_1_1::new_delimited( + DELIMITED_SEXP_OPCODE, + allocator, + buffer, + value_writer_config, + ); 
Self::with_container_writer(container_writer) } pub(crate) fn new_length_prefixed( allocator: &'top BumpAllocator, buffer: &'value mut BumpVec<'top, u8>, + value_writer_config: ValueWriterConfig, ) -> Self { const LENGTH_PREFIXED_SEXP_TYPE_CODE: u8 = 0xC0; const LENGTH_PREFIXED_FLEX_LEN_SEXP_TYPE_CODE: u8 = 0xFC; @@ -232,6 +261,7 @@ impl<'value, 'top> BinarySExpWriter_1_1<'value, 'top> { LENGTH_PREFIXED_FLEX_LEN_SEXP_TYPE_CODE, allocator, buffer, + value_writer_config, ); Self::with_container_writer(container_writer) } @@ -241,11 +271,11 @@ impl<'value, 'top> MakeValueWriter for BinarySExpWriter_1_1<'value, 'top> { type ValueWriter<'a> = BinaryValueWriter_1_1<'a, 'top> where Self: 'a; fn make_value_writer(&mut self) -> Self::ValueWriter<'_> { - let delimited_containers = self.container_writer.has_delimited_containers(); + let value_writer_config = self.container_writer.config(); BinaryValueWriter_1_1::new( self.container_writer.allocator(), self.container_writer.child_values_buffer(), - delimited_containers, + value_writer_config, ) } } @@ -277,6 +307,7 @@ impl<'value, 'top> BinaryStructWriter_1_1<'value, 'top> { pub(crate) fn new_length_prefixed( allocator: &'top BumpAllocator, buffer: &'value mut BumpVec<'top, u8>, + value_writer_config: ValueWriterConfig, ) -> Self { const LENGTH_PREFIXED_STRUCT_TYPE_CODE: u8 = 0xD0; const LENGTH_PREFIXED_FLEX_LEN_STRUCT_TYPE_CODE: u8 = 0xFD; @@ -285,6 +316,7 @@ impl<'value, 'top> BinaryStructWriter_1_1<'value, 'top> { LENGTH_PREFIXED_FLEX_LEN_STRUCT_TYPE_CODE, allocator, buffer, + value_writer_config, ); Self { flex_uint_encoding: true, @@ -295,10 +327,15 @@ impl<'value, 'top> BinaryStructWriter_1_1<'value, 'top> { pub(crate) fn new_delimited( allocator: &'top BumpAllocator, buffer: &'value mut BumpVec<'top, u8>, + value_writer_config: ValueWriterConfig, ) -> Self { const DELIMITED_STRUCT_OPCODE: u8 = 0xF3; - let container_writer = - BinaryContainerWriter_1_1::new_delimited(DELIMITED_STRUCT_OPCODE, allocator, buffer); + let 
container_writer = BinaryContainerWriter_1_1::new_delimited( + DELIMITED_STRUCT_OPCODE, + allocator, + buffer, + value_writer_config, + ); Self { // Delimited structs always use FlexSym encoding. flex_uint_encoding: false, @@ -358,19 +395,19 @@ impl<'value, 'top> StructWriter for BinaryStructWriter_1_1<'value, 'top> { pub struct BinaryEExpWriter_1_1<'value, 'top> { allocator: &'top BumpAllocator, buffer: &'value mut BumpVec<'top, u8>, - delimited_containers: bool, + value_writer_config: ValueWriterConfig, } impl<'value, 'top> BinaryEExpWriter_1_1<'value, 'top> { pub fn new( allocator: &'top BumpAllocator, buffer: &'value mut BumpVec<'top, u8>, - delimited_containers: bool, + value_writer_config: ValueWriterConfig, ) -> Self { Self { allocator, buffer, - delimited_containers, + value_writer_config, } } } @@ -379,7 +416,7 @@ impl<'value, 'top> MakeValueWriter for BinaryEExpWriter_1_1<'value, 'top> { type ValueWriter<'a> = BinaryValueWriter_1_1<'a, 'top> where Self: 'a; fn make_value_writer(&mut self) -> Self::ValueWriter<'_> { - BinaryValueWriter_1_1::new(self.allocator, self.buffer, self.delimited_containers) + BinaryValueWriter_1_1::new(self.allocator, self.buffer, self.value_writer_config) } } diff --git a/src/lazy/encoder/binary/v1_1/flex_sym.rs b/src/lazy/encoder/binary/v1_1/flex_sym.rs index f610562d..b084dd91 100644 --- a/src/lazy/encoder/binary/v1_1/flex_sym.rs +++ b/src/lazy/encoder/binary/v1_1/flex_sym.rs @@ -73,9 +73,14 @@ impl<'top> FlexSym<'top> { Ordering::Less => { let flex_int_len = value.size_in_bytes(); let len = sym_value.unsigned_abs() as usize; - let text = std::str::from_utf8(&input[flex_int_len..flex_int_len + len]).map_err( - |_| IonError::decoding_error("found FlexSym with invalid UTF-8 data"), - )?; + let flex_sym_end = flex_int_len + len; + if input.len() < flex_sym_end { + return IonResult::incomplete("reading a FlexSym", offset); + } + let text = + std::str::from_utf8(&input[flex_int_len..flex_sym_end]).map_err(|_| { + 
IonError::decoding_error("found FlexSym with invalid UTF-8 data") + })?; let symbol_ref = Text(text); (FlexSymValue::SymbolRef(symbol_ref), flex_int_len + len) } diff --git a/src/lazy/encoder/binary/v1_1/value_writer.rs b/src/lazy/encoder/binary/v1_1/value_writer.rs index 527c65cb..6e65e8ee 100644 --- a/src/lazy/encoder/binary/v1_1/value_writer.rs +++ b/src/lazy/encoder/binary/v1_1/value_writer.rs @@ -14,6 +14,10 @@ use crate::lazy::encoder::binary::v1_1::{flex_int::FlexInt, flex_uint::FlexUInt} use crate::lazy::encoder::private::Sealed; use crate::lazy::encoder::value_writer::ValueWriter; use crate::lazy::encoder::value_writer::{delegate_value_writer_to_self, AnnotatableWriter}; +use crate::lazy::encoder::value_writer_config::{ + AnnotationsEncoding, ContainerEncoding, FieldNameEncoding, SymbolValueEncoding, + ValueWriterConfig, +}; use crate::lazy::text::raw::v1_1::reader::MacroIdRef; use crate::raw_symbol_ref::AsRawSymbolRef; use crate::result::IonFailure; @@ -29,29 +33,64 @@ const DEFAULT_CONTAINER_BUFFER_SIZE: usize = 512; pub struct BinaryValueWriter_1_1<'value, 'top> { allocator: &'top BumpAllocator, encoding_buffer: &'value mut BumpVec<'top, u8>, - delimited_containers: bool, + value_writer_config: ValueWriterConfig, } impl<'value, 'top> BinaryValueWriter_1_1<'value, 'top> { pub fn new<'a, 'b: 'a>( allocator: &'b BumpAllocator, encoding_buffer: &'a mut BumpVec<'b, u8>, - delimited_containers: bool, + value_writer_config: ValueWriterConfig, ) -> BinaryValueWriter_1_1<'a, 'b> { BinaryValueWriter_1_1 { allocator, encoding_buffer, - delimited_containers, + value_writer_config, } } + pub fn config(&self) -> ValueWriterConfig { + self.value_writer_config + } + pub fn with_delimited_containers(mut self) -> Self { - self.delimited_containers = true; + self.value_writer_config = self.value_writer_config.with_delimited_containers(); + self + } + + pub fn with_inline_symbol_text(mut self) -> Self { + self.value_writer_config = 
self.value_writer_config.with_symbol_value_encoding(SymbolValueEncoding::WriteAsInlineText); self } - pub fn with_length_prefixed_containers(mut self) -> Self { - self.delimited_containers = false; + pub fn with_container_encoding(mut self, container_encoding: ContainerEncoding) -> Self { + self.value_writer_config = self + .value_writer_config + .with_container_encoding(container_encoding); + self + } + + pub fn with_symbol_value_encoding( + mut self, + symbol_value_encoding: SymbolValueEncoding, + ) -> Self { + self.value_writer_config = self + .value_writer_config + .with_symbol_value_encoding(symbol_value_encoding); + self + } + + pub fn with_annotations_encoding(mut self, annotations_encoding: AnnotationsEncoding) -> Self { + self.value_writer_config = self + .value_writer_config + .with_annotations_encoding(annotations_encoding); + self + } + + pub fn with_field_name_encoding(mut self, field_name_encoding: FieldNameEncoding) -> Self { + self.value_writer_config = self + .value_writer_config + .with_field_name_encoding(field_name_encoding); self } @@ -585,28 +624,44 @@ impl<'value, 'top> BinaryValueWriter_1_1<'value, 'top> { } fn list_writer(self) -> IonResult<<Self as ValueWriter>::ListWriter> { - let writer = if self.delimited_containers { - BinaryListWriter_1_1::new_delimited(self.allocator, self.encoding_buffer) + let writer = if self.config().has_delimited_containers() { + BinaryListWriter_1_1::new_delimited(self.allocator, self.encoding_buffer, self.config()) } else { - BinaryListWriter_1_1::new_length_prefixed(self.allocator, self.encoding_buffer) + BinaryListWriter_1_1::new_length_prefixed( + self.allocator, + self.encoding_buffer, + self.config(), + ) }; Ok(writer) } fn sexp_writer(self) -> IonResult<<Self as ValueWriter>::SExpWriter> { - let writer = if self.delimited_containers { - BinarySExpWriter_1_1::new_delimited(self.allocator, self.encoding_buffer) + let writer = if self.config().has_delimited_containers() { + BinarySExpWriter_1_1::new_delimited(self.allocator, self.encoding_buffer, self.config()) } else { -
BinarySExpWriter_1_1::new_length_prefixed(self.allocator, self.encoding_buffer) + BinarySExpWriter_1_1::new_length_prefixed( + self.allocator, + self.encoding_buffer, + self.config(), + ) }; Ok(writer) } fn struct_writer(self) -> IonResult<<Self as ValueWriter>::StructWriter> { - let writer = if self.delimited_containers { - BinaryStructWriter_1_1::new_delimited(self.allocator, self.encoding_buffer) + let writer = if self.config().has_delimited_containers() { + BinaryStructWriter_1_1::new_delimited( + self.allocator, + self.encoding_buffer, + self.config(), + ) } else { - BinaryStructWriter_1_1::new_length_prefixed(self.allocator, self.encoding_buffer) + BinaryStructWriter_1_1::new_length_prefixed( + self.allocator, + self.encoding_buffer, + self.config(), + ) }; Ok(writer) } @@ -631,7 +686,7 @@ impl<'value, 'top> BinaryValueWriter_1_1<'value, 'top> { Ok(BinaryEExpWriter_1_1::new( self.allocator, self.encoding_buffer, - self.delimited_containers, + self.config(), )) } } @@ -653,6 +708,7 @@ impl<'value, 'top> AnnotatableWriter for BinaryValueWriter_1_1<'value, 'top> { self.allocator, self.encoding_buffer, annotations.into_annotations_vec(), + self.config(), )) } } @@ -665,6 +721,10 @@ impl<'value, 'top> ValueWriter for BinaryValueWriter_1_1<'value, 'top> { type EExpWriter = BinaryEExpWriter_1_1<'value, 'top>; delegate_value_writer_to_self!(); + + fn config(&self) -> ValueWriterConfig { + self.config() + } } /// Takes a series of `TYPE => METHOD` pairs, generating a function for each that encodes an @@ -678,7 +738,11 @@ macro_rules! annotate_and_delegate_1_1 { fn $method(mut self, value: $value_type) -> IonResult<()> { self.encode_annotations(); // We've encoded the annotations, now create a no-annotations ValueWriter to encode the value itself.
- let value_writer = $crate::lazy::encoder::binary::v1_1::value_writer::BinaryValueWriter_1_1::new(self.allocator, self.buffer, self.delimited_containers); + let value_writer = $crate::lazy::encoder::binary::v1_1::value_writer::BinaryValueWriter_1_1::new( + self.allocator, + self.buffer, + self.config(), + ); value_writer.$method(value)?; Ok(()) } @@ -690,7 +754,7 @@ pub struct BinaryAnnotatedValueWriter_1_1<'value, 'top> { annotations: AnnotationsVec<'value>, allocator: &'top BumpAllocator, buffer: &'value mut BumpVec<'top, u8>, - delimited_containers: bool, + value_writer_config: ValueWriterConfig, } impl<'value, 'top> BinaryAnnotatedValueWriter_1_1<'value, 'top> { @@ -757,6 +821,7 @@ impl<'value, 'top> AnnotatableWriter for BinaryAnnotatedValueWriter_1_1<'value, self.allocator, self.buffer, annotations.into_annotations_vec(), + self.config(), )) } } @@ -803,6 +868,10 @@ impl<'value, 'top> ValueWriter for BinaryAnnotatedValueWriter_1_1<'value, 'top> } self.value_writer().eexp_writer(macro_id) } + + fn config(&self) -> ValueWriterConfig { + self.value_writer_config + } } impl<'value, 'top> BinaryAnnotatedValueWriter_1_1<'value, 'top> { @@ -810,18 +879,17 @@ impl<'value, 'top> BinaryAnnotatedValueWriter_1_1<'value, 'top> { allocator: &'top BumpAllocator, buffer: &'value mut BumpVec<'top, u8>, annotations: AnnotationsVec<'value>, + value_writer_config: ValueWriterConfig, ) -> Self { Self { allocator, buffer, annotations, - delimited_containers: false, + value_writer_config, } } pub(crate) fn value_writer(self) -> BinaryValueWriter_1_1<'value, 'top> { - let mut writer = - BinaryValueWriter_1_1::new(self.allocator, self.buffer, self.delimited_containers); - writer.delimited_containers = self.delimited_containers; + let writer = BinaryValueWriter_1_1::new(self.allocator, self.buffer, self.config()); writer } diff --git a/src/lazy/encoder/binary/v1_1/writer.rs b/src/lazy/encoder/binary/v1_1/writer.rs index c7c5f56c..776fa9b3 100644 --- 
a/src/lazy/encoder/binary/v1_1/writer.rs +++ b/src/lazy/encoder/binary/v1_1/writer.rs @@ -8,6 +8,7 @@ use crate::lazy::encoder::binary::v1_1::value_writer::BinaryValueWriter_1_1; use crate::lazy::encoder::private::Sealed; use crate::lazy::encoder::value_writer::internal::MakeValueWriter; use crate::lazy::encoder::value_writer::SequenceWriter; +use crate::lazy::encoder::value_writer_config::ValueWriterConfig; use crate::lazy::encoder::write_as_ion::WriteAsIon; use crate::lazy::encoder::LazyRawWriter; use crate::lazy::encoding::Encoding; @@ -110,7 +111,7 @@ impl LazyRawBinaryWriter_1_1 { &self.allocator, top_level, // By default, writers use length-prefixed encodings. - false, + ValueWriterConfig::default(), ) } } diff --git a/src/lazy/encoder/mod.rs b/src/lazy/encoder/mod.rs index 386d5302..ceec5c20 100644 --- a/src/lazy/encoder/mod.rs +++ b/src/lazy/encoder/mod.rs @@ -14,6 +14,7 @@ pub mod annotation_seq; pub mod binary; pub mod text; pub mod value_writer; +pub mod value_writer_config; pub mod write_as_ion; pub mod writer; diff --git a/src/lazy/encoder/value_writer.rs b/src/lazy/encoder/value_writer.rs index 3b8e3456..9143bcf5 100644 --- a/src/lazy/encoder/value_writer.rs +++ b/src/lazy/encoder/value_writer.rs @@ -99,6 +99,10 @@ pub trait ValueWriter: AnnotatableWriter + Sized { strukt.write_all(values)?; strukt.close() } + + fn config(&self) -> ValueWriterConfig { + ValueWriterConfig::default() + } } /// There are several implementations of `ValueWriter` that simply delegate calls to an expression. @@ -193,6 +197,7 @@ macro_rules! 
delegate_value_writer_to_self { }; } +use crate::lazy::encoder::value_writer_config::ValueWriterConfig; pub(crate) use delegate_value_writer_to; pub(crate) use delegate_value_writer_to_self; diff --git a/src/lazy/encoder/value_writer_config.rs b/src/lazy/encoder/value_writer_config.rs new file mode 100644 index 00000000..b0db84fd --- /dev/null +++ b/src/lazy/encoder/value_writer_config.rs @@ -0,0 +1,143 @@ +/// Configuration options available to Ion 1.1 value writers. +/// +/// The default configuration aligns closely with Ion 1.0's encoding. All symbols, field names, and +/// annotations are added to the symbol table and encoded as symbol IDs. +#[derive(Copy, Clone, Debug, Default)] +pub struct ValueWriterConfig { + // How nested containers should be encoded. + container_encoding: ContainerEncoding, + // How symbol values should be encoded. + symbol_value_encoding: SymbolValueEncoding, + // How annotation sequences should be encoded + annotations_encoding: AnnotationsEncoding, + // If this writer emits a struct, the struct will encode its field names according to this setting. + field_name_encoding: FieldNameEncoding, +} + +/// Configuration options for encoding containers. +#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)] +pub enum ContainerEncoding { + /// The container's length will be prepended to its contents. This requires more work for the + /// writer, but allows the reader to traverse the stream more quickly via skip-scanning. + #[default] + LengthPrefixed, + /// The start and end of the container will be encoded as opcodes. This requires less work for + /// the writer, but the reader will not be able to skip over the container. + Delimited, +} + +// ===== Symbol text encoding policies ===== +// +// The types below are very similar to one another. They have been kept distinct for two reasons: +// 1. It makes code setting these options a bit more obvious/self-documenting. +// 2. 
It leaves the door open to adding distinctive new options to each individually. + +/// Configuration options for encoding a symbol value. +#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)] +#[non_exhaustive] +pub enum SymbolValueEncoding { + /// Add all symbol values to the symbol table and encode them as symbol IDs. + #[default] + WriteAsSymbolIds, + /// Do not add symbol values to the symbol table; write their text inline. + /// Symbol values specified as symbol IDs will not be mapped to text. + WriteAsInlineText, + /// If a symbol value is already in the symbol table, encode it as a symbol ID. + /// If it is not already in the symbol table, encode its text inline. + WriteNewSymbolsAsInlineText, +} + +/// Configuration options for encoding an annotations sequence. +#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)] +#[non_exhaustive] +pub enum AnnotationsEncoding { + /// Add all annotations to the symbol table and encode them as symbol IDs. + #[default] + WriteAsSymbolIds, + /// Do not add annotations to the symbol table; write their text inline. + /// Annotations specified as symbol IDs will not be mapped to text. + WriteAsInlineText, + /// If an annotation is already in the symbol table, encode it as a symbol ID. + /// If it is not already in the symbol table, encode its text inline. + WriteNewSymbolsAsInlineText, +} + +/// Configuration options for encoding a struct field name. +#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)] +#[non_exhaustive] +pub enum FieldNameEncoding { + /// Add all field names to the symbol table and encode them as symbol IDs. + #[default] + WriteAsSymbolIds, + /// Do not add field names to the symbol table; write their text inline. + /// Field names specified as symbol IDs will not be mapped to text. + WriteAsInlineText, + /// If a field name is already in the symbol table, encode it as a symbol ID. + /// If it is not already in the symbol table, encode its text inline.
+ WriteNewSymbolsAsInlineText, +} + +impl ValueWriterConfig { + /// Constructs a default `ValueWriterConfig`. + pub fn new() -> Self { + ValueWriterConfig::default() + } + + pub fn container_encoding(&self) -> ContainerEncoding { + self.container_encoding + } + + pub fn symbol_value_encoding(&self) -> SymbolValueEncoding { + self.symbol_value_encoding + } + + pub fn field_name_encoding(&self) -> FieldNameEncoding { + self.field_name_encoding + } + + pub fn annotations_encoding(&self) -> AnnotationsEncoding { + self.annotations_encoding + } + + /// Returns `true` if this value writer will write nested containers with a delimited encoding. + pub fn has_delimited_containers(&self) -> bool { + self.container_encoding == ContainerEncoding::Delimited + } + + /// Configures this value writer to write nested containers using a delimited encoding rather + /// than the default length-prefixed encoding. + pub fn with_delimited_containers(mut self) -> Self { + self.container_encoding = ContainerEncoding::Delimited; + self + } + + /// Configures how this value writer will encode nested containers: either length-prefixed + /// (`ContainerEncoding::LengthPrefixed`) or delimited (`ContainerEncoding::Delimited`). + pub fn with_container_encoding(mut self, container_encoding: ContainerEncoding) -> Self { + self.container_encoding = container_encoding; + self + } + + /// Configures how this value writer will encode symbol values. Annotations and field names + /// have their own, separate settings. + pub fn with_symbol_value_encoding( + mut self, + symbol_value_encoding: SymbolValueEncoding, + ) -> Self { + self.symbol_value_encoding = symbol_value_encoding; + self + } + + /// Configures how this value writer will encode its annotations (if any).
+ pub fn with_annotations_encoding(mut self, annotations_encoding: AnnotationsEncoding) -> Self { + self.annotations_encoding = annotations_encoding; + self + } + + /// If this value writer is used to write a struct, the struct will be configured to encode its + /// field names according to the specified `field_name_encoding`. + pub fn with_field_name_encoding(mut self, field_name_encoding: FieldNameEncoding) -> Self { + self.field_name_encoding = field_name_encoding; + self + } +} diff --git a/src/lazy/encoder/writer.rs b/src/lazy/encoder/writer.rs index 6088b5c4..d4068eb4 100644 --- a/src/lazy/encoder/writer.rs +++ b/src/lazy/encoder/writer.rs @@ -4,11 +4,16 @@ use delegate::delegate; use ice_code::ice as cold_path; use crate::constants::v1_0::system_symbol_ids; -use crate::lazy::encoder::annotation_seq::AnnotationSeq; +use crate::lazy::encoder::annotation_seq::{AnnotationSeq, AnnotationsVec}; +use crate::lazy::encoder::binary::v1_1::value_writer::BinaryValueWriter_1_1; use crate::lazy::encoder::value_writer::internal::{FieldEncoder, MakeValueWriter}; use crate::lazy::encoder::value_writer::{ AnnotatableWriter, EExpWriter, SequenceWriter, StructWriter, ValueWriter, }; +use crate::lazy::encoder::value_writer_config::{ + AnnotationsEncoding, ContainerEncoding, FieldNameEncoding, SymbolValueEncoding, + ValueWriterConfig, +}; use crate::lazy::encoder::write_as_ion::WriteAsIon; use crate::lazy::encoder::{LazyRawWriter, SymbolCreationPolicy}; use crate::lazy::encoding::{ @@ -19,8 +24,8 @@ use crate::raw_symbol_ref::AsRawSymbolRef; use crate::result::IonFailure; use crate::write_config::WriteConfig; use crate::{ - Decimal, Element, ElementWriter, Int, IonResult, IonType, RawSymbolRef, Symbol, SymbolId, - SymbolTable, Timestamp, UInt, Value, + Decimal, Element, ElementWriter, Int, IonResult, IonType, RawSymbolRef, Symbol, SymbolTable, + Timestamp, UInt, Value, }; pub(crate) struct WriteContext { @@ -197,6 +202,43 @@ impl<'a, V: ValueWriter> ApplicationValueWriter<'a, V> { }
} +impl<'a, 'value, 'top> ApplicationValueWriter<'a, BinaryValueWriter_1_1<'value, 'top>> { + pub fn config(&self) -> ValueWriterConfig { + self.raw_value_writer.config() + } + + pub fn with_container_encoding(mut self, container_encoding: ContainerEncoding) -> Self { + self.raw_value_writer = self + .raw_value_writer + .with_container_encoding(container_encoding); + self + } + + pub fn with_annotations_encoding(mut self, annotations_encoding: AnnotationsEncoding) -> Self { + self.raw_value_writer = self + .raw_value_writer + .with_annotations_encoding(annotations_encoding); + self + } + + pub fn with_symbol_value_encoding( + mut self, + symbol_value_encoding: SymbolValueEncoding, + ) -> Self { + self.raw_value_writer = self + .raw_value_writer + .with_symbol_value_encoding(symbol_value_encoding); + self + } + + pub fn with_field_name_encoding(mut self, field_name_encoding: FieldNameEncoding) -> Self { + self.raw_value_writer = self + .raw_value_writer + .with_field_name_encoding(field_name_encoding); + self + } +} + impl<'value, V: ValueWriter> AnnotatableWriter for ApplicationValueWriter<'value, V> { type AnnotatedValueWriter<'a> = ApplicationValueWriter<'a, V::AnnotatedValueWriter<'a>> where Self: 'a; @@ -207,41 +249,125 @@ impl<'value, V: ValueWriter> AnnotatableWriter for ApplicationValueWriter<'value where Self: 'a, { - if self.encoding.symbol_creation_policy == SymbolCreationPolicy::WriteProvidedToken { - // Store the tokens as they are. Text will be written as text, symbol IDs will be written - // as symbol IDs. TODO: Lookup SIDs to see if they have text? - return Ok(ApplicationValueWriter { - encoding: self.encoding, - raw_value_writer: self.raw_value_writer.with_annotations(annotations)?, - }); - } - - // Otherwise, we're going to write everything as a symbol ID. Replace all text tokens in the - // annotations with the corresponding symbol ID, creating a new one if necessary. 
let mut annotations = annotations.into_annotations_vec(); - for annotation in &mut annotations { - let sid: SymbolId = match annotation.as_raw_symbol_token_ref() { + match self.config().annotations_encoding() { + AnnotationsEncoding::WriteAsSymbolIds => { + // Intern all text so everything we write is a symbol ID + self.map_all_annotations_to_symbol_ids(&mut annotations)? + } + AnnotationsEncoding::WriteAsInlineText => { + // Validate the symbol IDs, write the text as-is + self.validate_all_symbol_ids(&mut annotations)? + } + AnnotationsEncoding::WriteNewSymbolsAsInlineText => { + // Map all known strings to symbol IDs, leave new text as is. + self.map_known_symbols_to_symbol_ids(&mut annotations)? + } + }; + + Ok(ApplicationValueWriter { + encoding: self.encoding, + raw_value_writer: self.raw_value_writer.with_annotations(annotations)?, + }) + } +} + +impl<'value, V: ValueWriter> ApplicationValueWriter<'value, V> { + /// Converts each annotation in `annotations` to a symbol ID, adding symbols to the symbol table + /// as necessary. If one of the annotations is a symbol ID that is not in the symbol table, + /// returns an `Err`. + fn map_all_annotations_to_symbol_ids<'a>( + &mut self, + annotations: &mut AnnotationsVec<'a>, + ) -> IonResult<()> + where + Self: 'a, + { + for annotation in annotations { + match annotation.as_raw_symbol_token_ref() { // The token is already a symbol ID. - RawSymbolRef::SymbolId(sid) => sid, + RawSymbolRef::SymbolId(sid) => { + if !self.symbol_table().sid_is_valid(sid) { + return IonResult::encoding_error(format!( + "annotation symbol ID {sid} is out of range" + )); + } + } // The token is text... RawSymbolRef::Text(text) => { - if let Some(sid) = self.symbol_table().sid_for(&text) { - //...that was already in the symbol table. - sid - } else { - // ...that we need to add to the symbol table. 
- self.encoding.num_pending_symbols += 1; - self.symbol_table().add_symbol_for_text(text) - } + let sid = match self.symbol_table().sid_for(&text) { + Some(sid) => { + //...that was already in the symbol table. + sid + } + None => { + // ...that we need to add to the symbol table. + self.encoding.num_pending_symbols += 1; + self.symbol_table().add_symbol_for_text(text) + } + }; + *annotation = RawSymbolRef::SymbolId(sid); } }; - *annotation = RawSymbolRef::SymbolId(sid); } + Ok(()) + } - Ok(ApplicationValueWriter { - encoding: self.encoding, - raw_value_writer: self.raw_value_writer.with_annotations(annotations)?, - }) + /// Confirms all SIDs are in the symbol table while leaving text annotations as-is. + pub(crate) fn validate_all_symbol_ids<'a>( + &mut self, + annotations: &mut AnnotationsVec<'a>, + ) -> IonResult<()> + where + Self: 'a, + { + for annotation in annotations { + if let RawSymbolRef::SymbolId(sid) = annotation.as_raw_symbol_token_ref() { + if !self.symbol_table().sid_is_valid(sid) { + return IonResult::encoding_error(format!( + "annotation symbol ID {sid} is not in the symbol table" + )); + } + } + } + Ok(()) + } + + /// Converts annotations with known text to their corresponding symbol ID. Annotations with + /// text not yet in the symbol table are left as-is. If a symbol ID is not in the symbol table, + /// returns an `Err`. + fn map_known_symbols_to_symbol_ids<'a>( + &mut self, + annotations: &mut AnnotationsVec<'a>, + ) -> IonResult<()> + where + Self: 'a, + { + for annotation in annotations { + match annotation.as_raw_symbol_token_ref() { + // The token is already a symbol ID. + RawSymbolRef::SymbolId(sid) => { + if !self.symbol_table().sid_is_valid(sid) { + return IonResult::encoding_error(format!( + "annotation symbol ID {sid} is out of range" + )); + } + } + // The token is text... + RawSymbolRef::Text(text) => { + match self.symbol_table().sid_for(&text) { + Some(sid) => { + //...that was already in the symbol table. 
+ *annotation = RawSymbolRef::SymbolId(sid); + } + None => { + // ...that is not in the symbol table. Leave it as-is. + } + }; + } + }; + } + Ok(()) } } @@ -264,45 +390,60 @@ impl<'value, V: ValueWriter> ValueWriter for ApplicationValueWriter<'value, V> { fn write_string(self, value: impl AsRef) -> IonResult<()>; fn write_clob(self, value: impl AsRef<[u8]>) -> IonResult<()>; fn write_blob(self, value: impl AsRef<[u8]>) -> IonResult<()>; + fn config(&self) -> ValueWriterConfig; } } - fn write_symbol(mut self, value: impl AsRawSymbolRef) -> IonResult<()> { - // If it's a symbol ID, do a bounds check and then write it. - // Otherwise, get its associated text. - let text = match value.as_raw_symbol_token_ref() { - RawSymbolRef::SymbolId(symbol_id) => { - if !self.symbol_table().sid_is_valid(symbol_id) { + fn write_symbol(self, value: impl AsRawSymbolRef) -> IonResult<()> { + use RawSymbolRef::*; + use SymbolValueEncoding::*; + + let config = self.config(); + let Self { + encoding, + raw_value_writer, + } = self; + + // Depending on the symbol value encoding config option, map the provided symbol reference + // from text to SID or vice versa, performing any validation needed. + let symbol_ref = match value.as_raw_symbol_token_ref() { + SymbolId(symbol_id) => { + // We can write the symbol ID as-is. Make sure it's in the symbol table. + if !encoding.symbol_table.sid_is_valid(symbol_id) { return cold_path!(IonResult::encoding_error(format!( - "symbol ID ${symbol_id} is out of bounds" + "symbol value ID ${symbol_id} is not in the symbol table" ))); } - return self.raw_value_writer.write_symbol(symbol_id); + SymbolId(symbol_id) } - RawSymbolRef::Text(text) => text, - }; - - // If the writer can write it as inline text, do so. - if self.encoding.supports_text_tokens - && self.encoding.symbol_creation_policy == SymbolCreationPolicy::WriteProvidedToken - { - return self.raw_value_writer.write_symbol(text); - } - - // Otherwise, see if the symbol is already in the symbol table. 
- let symbol_id = match self.symbol_table().sid_for(&text) { - // If so, use the existing ID. - Some(sid) => sid, - // If not, add it to the symbol table and make a note to add it to the LST on the next - // call to `flush()`. Use the new ID. - None => { - self.encoding.num_pending_symbols += 1; - self.symbol_table().add_symbol_for_text(text) + Text(text) => { + match config.symbol_value_encoding() { + WriteAsSymbolIds => { + // Map the text to a symbol ID. + match encoding.symbol_table.sid_for(&text) { + // If it's already in the symbol table, use that SID. + Some(symbol_id) => SymbolId(symbol_id), + // Otherwise, add it to the symbol table. + None => { + encoding.num_pending_symbols += 1; + SymbolId(encoding.symbol_table.add_symbol_for_text(text)) + } + } + } + WriteNewSymbolsAsInlineText => { + // If the text is in the symbol table, use the symbol ID. Otherwise, use the text itself. + match encoding.symbol_table.sid_for(&text) { + Some(symbol_id) => SymbolId(symbol_id), + None => Text(text), + } + } + // We have text and we want to write text. Nothing to do. + WriteAsInlineText => Text(text), + } } }; - // Finally, write out the SID. 
- self.raw_value_writer.write_symbol(symbol_id) + raw_value_writer.write_symbol(symbol_ref) } fn list_writer(self) -> IonResult { @@ -369,7 +510,7 @@ impl<'value, V: ValueWriter> FieldEncoder for ApplicationStructWriter<'value, V> RawSymbolRef::SymbolId(symbol_id) => { if !self.encoding.symbol_table.sid_is_valid(symbol_id) { return cold_path!(IonResult::encoding_error(format!( - "symbol ID ${symbol_id} is out of bounds" + "symbol ID ${symbol_id} is not in the symbol table" ))); } return self.raw_struct_writer.encode_field_name(symbol_id); @@ -521,3 +662,188 @@ impl ElementWriter for S { Ok(()) } } + +#[cfg(test)] +mod tests { + use crate::lazy::encoder::value_writer::AnnotatableWriter; + use crate::lazy::encoder::value_writer_config::{AnnotationsEncoding, SymbolValueEncoding}; + use crate::raw_symbol_ref::AsRawSymbolRef; + use crate::{ + v1_1, HasSpan, IonResult, LazyRawValue, RawSymbolRef, SequenceWriter, SystemReader, + ValueWriter, Writer, + }; + + fn symbol_value_encoding_test( + encoding: SymbolValueEncoding, + symbol_and_encoding_pairs: [(A, &[u8]); N], + ) -> IonResult<()> { + let mut writer = Writer::new(v1_1::Binary, Vec::new())?; + for (symbol, _expected_bytes) in &symbol_and_encoding_pairs { + writer + .value_writer() + .with_symbol_value_encoding(encoding) + .write_symbol(symbol)?; + } + let bytes = writer.close()?; + let mut reader = SystemReader::new(v1_1::Binary, bytes.as_slice()); + for (symbol, expected_bytes) in &symbol_and_encoding_pairs { + let value = reader.expect_next_value()?; + let raw_value = value.raw().unwrap(); + let actual_bytes = raw_value.span().bytes(); + assert_eq!( + actual_bytes, *expected_bytes, + "{:02X?} != {:02X?}", + actual_bytes, expected_bytes + ); + println!( + "{:?} {:02X?} == {:02X?}", + symbol.as_raw_symbol_token_ref(), + actual_bytes, + expected_bytes + ) + } + Ok(()) + } + + #[test] + fn intern_new_symbol_values() -> IonResult<()> { + use RawSymbolRef::*; + symbol_value_encoding_test( + 
SymbolValueEncoding::WriteAsSymbolIds, + [ + (Text("$ion_symbol_table"), &[0xE1, 0x03]), + (Text("name"), &[0xE1, 0x04]), + (SymbolId(6), &[0xE1, 0x06]), // SIDs are written as-is + (Text("foo"), &[0xE1, 0x0A]), // Text is added to the symbol table and encoded as a SID + ], + ) + } + + #[test] + fn do_not_intern_new_symbol_values() -> IonResult<()> { + use RawSymbolRef::*; + symbol_value_encoding_test( + SymbolValueEncoding::WriteNewSymbolsAsInlineText, + [ + // Known text symbols are written as SIDs + (Text("$ion_symbol_table"), &[0xE1, 0x03]), + (Text("name"), &[0xE1, 0x04]), + // SIDs are written as-is + (SymbolId(6), &[0xE1, 0x06]), + // New text symbols are written as inline text + // f o o + (Text("foo"), &[0xA3, 0x66, 0x6F, 0x6F]), + ], + ) + } + + #[test] + fn encode_all_text_as_is() -> IonResult<()> { + use RawSymbolRef::*; + symbol_value_encoding_test( + SymbolValueEncoding::WriteAsInlineText, + [ + // Known text symbols are written as inline text + (Text("name"), &[0xA4, 0x6E, 0x61, 0x6D, 0x65]), + // SIDs are written as-is + (SymbolId(6), &[0xE1, 0x06]), + // New text symbols are written as inline text + // f o o + (Text("foo"), &[0xA3, 0x66, 0x6F, 0x6F]), + ], + ) + } + + fn annotations_sequence_encoding_test( + encoding: AnnotationsEncoding, + sequence: &[RawSymbolRef], + expected_encoding: &[u8], + ) -> IonResult<()> { + let mut writer = Writer::new(v1_1::Binary, Vec::new())?; + writer + .value_writer() + .with_annotations_encoding(encoding) + .with_annotations(sequence)? 
+ .write_string("foo")?; + let bytes = writer.close()?; + let mut reader = SystemReader::new(v1_1::Binary, bytes.as_slice()); + let value = reader.expect_next_value()?; + let raw_value = value.raw().unwrap(); + let annotations = raw_value.annotations_span(); + assert_eq!( + annotations.bytes(), + expected_encoding, + "{:02X?} != {:02X?}", + annotations.bytes(), + expected_encoding + ); + Ok(()) + } + + #[test] + fn intern_new_annotations() -> IonResult<()> { + use RawSymbolRef::*; + annotations_sequence_encoding_test( + AnnotationsEncoding::WriteAsSymbolIds, + &[ + Text("$ion_symbol_table"), + Text("name"), + SymbolId(6), + Text("foo"), + ], + &[ + 0xE9, // Opcode: FlexUInt follows with byte length of sequence + 0x09, // FlexUInt byte length: 4 + 0x07, // FlexSym SID $3 + 0x09, // FlexSym SID $4 + 0x0D, // FlexSym SID $6 + 0x15, // FlexSym SID $10 + ], + ) + } + + #[test] + fn write_new_annotations_as_text() -> IonResult<()> { + use RawSymbolRef::*; + annotations_sequence_encoding_test( + AnnotationsEncoding::WriteNewSymbolsAsInlineText, + &[ + Text("$ion_symbol_table"), + Text("name"), + SymbolId(6), + Text("foo"), + ], + &[ + 0xE9, // Opcode: FlexUInt follows with byte length of sequence + 0x0F, // FlexUInt byte length: 7 + 0x07, // FlexSym: SID $3 + 0x09, // FlexSym: SID $4 + 0x0D, // FlexSym: SID $6 + 0xFB, // FlexSym: 3 UTF-8 bytes + // f o o + 0x66, 0x6F, 0x6F, + ], + ) + } + + #[test] + #[rustfmt::skip] + fn write_text_annotations_as_is() -> IonResult<()> { + use RawSymbolRef::*; + annotations_sequence_encoding_test( + AnnotationsEncoding::WriteAsInlineText, + &[Text("name"), SymbolId(6), Text("foo")], + &[ + 0xE9, // Opcode: FlexUInt follows with byte length of sequence + 0x15, // FlexUInt byte length: 10 + 0xF9, // FlexSym: 4 UTF-8 bytes + // n a m e + 0x6E, 0x61, 0x6D, 0x65, + 0x0D, // FlexSym: SID $6 + 0xFB, // FlexSym: 3 UTF-8 bytes + // f o o + 0x66, 0x6F, 0x6F, + ], + ) + } +} diff --git a/src/lib.rs b/src/lib.rs index 1d408d09..451ac2e8 100644 --- 
a/src/lib.rs +++ b/src/lib.rs @@ -279,7 +279,14 @@ macro_rules! v1_x_tooling_apis { LazyRawContainer, }, lazy::encoder::{ - LazyRawWriter + LazyRawWriter, + }, + lazy::encoder::value_writer_config::{ + ValueWriterConfig, + ContainerEncoding, + SymbolValueEncoding, + AnnotationsEncoding, + FieldNameEncoding, }, lazy::expanded::r#struct::{ LazyExpandedStruct, ExpandedStructSource, diff --git a/src/serde/de.rs b/src/serde/de.rs index 7364dd4a..a80176d9 100644 --- a/src/serde/de.rs +++ b/src/serde/de.rs @@ -9,7 +9,10 @@ use crate::lazy::value_ref::ValueRef; use crate::result::IonFailure; use crate::serde::decimal::TUNNELED_DECIMAL_TYPE_NAME; use crate::serde::timestamp::TUNNELED_TIMESTAMP_TYPE_NAME; -use crate::{Decimal, IonError, IonResult, IonType, SystemReader, SystemStreamItem, Timestamp}; +use crate::{ + Decimal, IonEncoding, IonError, IonResult, IonType, RawVersionMarker, SystemReader, + SystemStreamItem, Timestamp, +}; /// Generic method that can deserialize an object from any given type /// that implements `IonInput`. @@ -18,26 +21,28 @@ where T: DeserializeOwned, I: IonInput, { + let mut ion_encoding = IonEncoding::default(); let mut reader = SystemReader::new(AnyEncoding, input); - let item = reader.next_item()?; - match item { - SystemStreamItem::VersionMarker(marker) => { - // Note that this uses the Ion version with which the IVM was encoded rather than - // the Ion version the stream is switching to. We can do this because the format - // (i.e text or binary) stays the same when the version changes. - // TODO: Use new encoding, once we have APIs to get new/old encodings for the marker. 
- let is_human_readable = marker.encoding().is_text(); - let value = reader.expect_next_value()?; - let value_deserializer = ValueDeserializer::new(&value, is_human_readable); - T::deserialize(value_deserializer) - } - SystemStreamItem::Value(value) => { - let value_deserializer = ValueDeserializer::new(&value, true); - T::deserialize(value_deserializer) + loop { + match reader.next_item()? { + SystemStreamItem::VersionMarker(marker) => { + // It's a version marker; update the detected Ion encoding + ion_encoding = marker.stream_encoding_after_marker()?; + } + SystemStreamItem::Value(value) => { + let value_deserializer = ValueDeserializer::new( + &value, + /*is_human_readable=*/ ion_encoding.is_text(), + ); + return T::deserialize(value_deserializer); + } + SystemStreamItem::EndOfStream(_end) => { + return IonResult::decoding_error("stream did not contain any values") + } + _system_value => { + // Ignore system values + } } - _ => IonResult::decoding_error( - "The first item found as symbol table or end of stream while reading", - ), } }