From 1f8fd89bf2096076f15b55bc3b007206e870fb8f Mon Sep 17 00:00:00 2001
From: AndreaOddo89
Date: Thu, 15 Sep 2022 11:08:18 +0200
Subject: [PATCH 1/3] Add support for source byte-range tracking for ByteRecord

---
 src/byte_record.rs | 50 ++++++++++++++++++++++++++++++++++++++++++++++
 src/lib.rs         |  2 +-
 src/reader.rs      |  3 ++-
 3 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/src/byte_record.rs b/src/byte_record.rs
index 4ccbb96..8b3a0e3 100644
--- a/src/byte_record.rs
+++ b/src/byte_record.rs
@@ -86,6 +86,8 @@ impl fmt::Debug for ByteRecord {
 struct ByteRecordInner {
     /// The position of this byte record.
     pos: Option<Position>,
+    /// The source span represented by this byte record.
+    span: Option<Span>,
     /// All fields in this record, stored contiguously.
     fields: Vec<u8>,
     /// The number of and location of each field in this record.
@@ -136,6 +138,7 @@ impl ByteRecord {
     pub fn with_capacity(buffer: usize, fields: usize) -> ByteRecord {
         ByteRecord(Box::new(ByteRecordInner {
             pos: None,
+            span: None,
             fields: vec![0; buffer],
             bounds: Bounds::with_capacity(fields),
         }))
@@ -370,6 +373,7 @@ impl ByteRecord {
         let mut trimmed =
             ByteRecord::with_capacity(self.as_slice().len(), self.len());
         trimmed.set_position(self.position().cloned());
+        trimmed.set_span(self.span().cloned());
         for field in &*self {
             trimmed.push_field(field.trim());
         }
@@ -460,6 +464,18 @@ impl ByteRecord {
         self.0.pos = pos;
     }
 
+    /// Return the source span of this record, if available.
+    #[inline]
+    pub fn span(&self) -> Option<&Span> {
+        self.0.span.as_ref()
+    }
+
+    /// Sets the source span of this record.
+    #[inline]
+    pub fn set_span(&mut self, span: Option<Span>) {
+        self.0.span = span;
+    }
+
     /// Return the start and end position of a field in this record.
     ///
     /// If no such field exists at the given index, then return `None`.
@@ -639,6 +655,40 @@ impl Position {
     }
 }
 
+/// A span in a CSV source bytes
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub struct Span {
+    start: u64,
+    end: u64,
+}
+
+impl Span {
+    /// Create a new span
+    #[inline]
+    pub fn new(start: u64, end: u64) -> Self {
+        assert!(end >= start);
+        Span { start, end }
+    }
+
+    /// Retrieve the start of this span
+    #[inline]
+    pub fn start(&self) -> u64 {
+        self.start
+    }
+
+    /// Retrieve the end of this span
+    #[inline]
+    pub fn end(&self) -> u64 {
+        self.end
+    }
+
+    /// Retrieve the length of this span
+    #[inline]
+    pub fn len(&self) -> usize {
+        (self.end - self.start) as usize
+    }
+}
+
 /// The bounds of fields in a single record.
 #[derive(Clone, Debug, Eq, PartialEq)]
 struct Bounds {
diff --git a/src/lib.rs b/src/lib.rs
index 3c771c9..116c965 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -153,7 +153,7 @@ use std::result;
 
 use serde::{Deserialize, Deserializer};
 
-pub use crate::byte_record::{ByteRecord, ByteRecordIter, Position};
+pub use crate::byte_record::{ByteRecord, ByteRecordIter, Position, Span};
 pub use crate::deserializer::{DeserializeError, DeserializeErrorKind};
 pub use crate::error::{
     Error, ErrorKind, FromUtf8Error, IntoInnerError, Result, Utf8Error,
diff --git a/src/reader.rs b/src/reader.rs
index 3d66eea..79e0886 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -7,7 +7,7 @@ use std::result;
 use csv_core::{Reader as CoreReader, ReaderBuilder as CoreReaderBuilder};
 use serde::de::DeserializeOwned;
 
-use crate::byte_record::{ByteRecord, Position};
+use crate::byte_record::{ByteRecord, Position, Span};
 use crate::error::{Error, ErrorKind, Result, Utf8Error};
 use crate::string_record::StringRecord;
 use crate::{Terminator, Trim};
@@ -1667,6 +1667,7 @@ impl<R: io::Read> Reader<R> {
                 }
                 Record => {
                     record.set_len(endlen);
+                    record.set_span(Some(Span::new(record.position().unwrap().byte(), self.state.cur_pos.byte())));
                     self.state.add_record(record)?;
                     return Ok(true);
                 }

From 3471c664b32f76c9dc709caabf9a42c9d9324aed Mon Sep 17 00:00:00 2001
From: Francesco Degrassi
Date: Sat, 15 Oct 2022 00:20:17 +0200
Subject: [PATCH 2/3] :enh: ensure only full records are flushed from Writer to inner Write when using write_byte_record

---
 src/writer.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/writer.rs b/src/writer.rs
index 0577c60..1636214 100644
--- a/src/writer.rs
+++ b/src/writer.rs
@@ -954,7 +954,10 @@ impl<W: io::Write> Writer<W> {
             // The maximum number of bytes for the terminator.
             + 2;
         if self.buf.writable().len() < upper_bound {
-            return self.write_record(record);
+            // Flush before writing a record, so we only flush on whole records
+            self.flush_buf()?;
+            // Fail if we cannot free enough buffer space
+            assert!(self.buf.writable().len() >= upper_bound, "Not enough buffer space");
         }
         let mut first = true;
         for field in record.iter() {

From e34403cddee7957b691f952379db886a065ed588 Mon Sep 17 00:00:00 2001
From: Francesco Degrassi
Date: Wed, 30 Nov 2022 16:38:40 +0100
Subject: [PATCH 3/3] :new: allow to peek at the inner writer of a csv Writer

---
 src/writer.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/writer.rs b/src/writer.rs
index 1636214..b9103d8 100644
--- a/src/writer.rs
+++ b/src/writer.rs
@@ -1085,6 +1085,11 @@ impl<W: io::Write> Writer<W> {
         }
     }
 
+    /// Return an immutable reference to the underlying writer.
+    pub fn inner(&self) -> &W {
+        self.wtr.as_ref().expect("Called inner() with a None self.wtr")
+    }
+
     /// Write a CSV delimiter.
     fn write_delimiter(&mut self) -> Result<()> {
         loop {