diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md index e299cde..f589cb4 100644 --- a/docs/ReleaseNotes.md +++ b/docs/ReleaseNotes.md @@ -1,5 +1,8 @@ # Sylvan.Data.Excel Release Notes +_0.4.22_ +- Adds `ExcelDataReaderOptions.IgnoreEmptyTrailingRows` option. + _0.4.21_ - Fixes a bug that could cause incorrect behavior when reading certain .xls files. diff --git a/source/Sylvan.Data.Excel.Tests/Data/TrailingBlank.xls b/source/Sylvan.Data.Excel.Tests/Data/TrailingBlank.xls new file mode 100644 index 0000000..a58a809 Binary files /dev/null and b/source/Sylvan.Data.Excel.Tests/Data/TrailingBlank.xls differ diff --git a/source/Sylvan.Data.Excel.Tests/Data/TrailingBlank.xlsb b/source/Sylvan.Data.Excel.Tests/Data/TrailingBlank.xlsb new file mode 100644 index 0000000..69ad6a5 Binary files /dev/null and b/source/Sylvan.Data.Excel.Tests/Data/TrailingBlank.xlsb differ diff --git a/source/Sylvan.Data.Excel.Tests/Data/TrailingBlank.xlsx b/source/Sylvan.Data.Excel.Tests/Data/TrailingBlank.xlsx new file mode 100644 index 0000000..824e325 Binary files /dev/null and b/source/Sylvan.Data.Excel.Tests/Data/TrailingBlank.xlsx differ diff --git a/source/Sylvan.Data.Excel.Tests/ExcelDataReaderTests.cs b/source/Sylvan.Data.Excel.Tests/ExcelDataReaderTests.cs index a85b4b4..997402c 100644 --- a/source/Sylvan.Data.Excel.Tests/ExcelDataReaderTests.cs +++ b/source/Sylvan.Data.Excel.Tests/ExcelDataReaderTests.cs @@ -1423,7 +1423,42 @@ public async Task AllFieldAccessorsThrowWhenInvalid() Assert.Throws(() => edr.GetValues(values)); Assert.Throws(() => edr.IsDBNull(0)); await Assert.ThrowsAsync(async () => await edr.IsDBNullAsync(0)); + } + + [Fact] + public void TrailingBlank() + { + var file = GetFile("TrailingBlank"); + using var edr = ExcelDataReader.Create(file); + Assert.True(edr.Read()); + Assert.Equal("b", edr.GetString(0)); + Assert.True(edr.Read()); + Assert.Equal("", edr.GetString(0)); + Assert.True(edr.Read()); + Assert.Equal("c", edr.GetString(0)); + var x = edr.Read(); + var 
y = edr.RowFieldCount; + } + [Fact] + public void TrailingBlankNoSkip() + { + var file = GetFile("TrailingBlank"); + var opt = new ExcelDataReaderOptions { IgnoreEmptyTrailingRows = false }; + using var edr = ExcelDataReader.Create(file, opt); + Assert.True(edr.Read()); + Assert.Equal("b", edr.GetString(0)); + Assert.True(edr.Read()); + Assert.Equal("", edr.GetString(0)); + Assert.True(edr.Read()); + Assert.Equal("c", edr.GetString(0)); + while(edr.RowNumber < 10) + { + Assert.True(edr.Read()); + Assert.Equal("", edr.GetString(0)); + Assert.Equal(0, edr.RowFieldCount); + } + Assert.False(edr.Read()); } #if ASYNC diff --git a/source/Sylvan.Data.Excel.Tests/ExcelDataWriterTests.cs b/source/Sylvan.Data.Excel.Tests/ExcelDataWriterTests.cs index c98eece..a16908f 100644 --- a/source/Sylvan.Data.Excel.Tests/ExcelDataWriterTests.cs +++ b/source/Sylvan.Data.Excel.Tests/ExcelDataWriterTests.cs @@ -106,7 +106,7 @@ public void Simple() { w.Write(reader); } - Open(f); + //Open(f); Validate(f); } diff --git a/source/Sylvan.Data.Excel/ExcelDataReader+FieldInfo.cs b/source/Sylvan.Data.Excel/ExcelDataReader+FieldInfo.cs index ad89791..8ad42dc 100644 --- a/source/Sylvan.Data.Excel/ExcelDataReader+FieldInfo.cs +++ b/source/Sylvan.Data.Excel/ExcelDataReader+FieldInfo.cs @@ -16,6 +16,15 @@ private protected struct FieldInfo public DateTime dtValue; public int xfIdx; + + internal bool IsEmptyValue + { + get + { + return this.type == ExcelDataType.Null || (this.type == ExcelDataType.String && this.strValue?.Length == 0); + } + } + internal ExcelErrorCode ErrorCode { get { return (ExcelErrorCode)numValue; } diff --git a/source/Sylvan.Data.Excel/ExcelDataReader.cs b/source/Sylvan.Data.Excel/ExcelDataReader.cs index cc56d4d..295c7fe 100644 --- a/source/Sylvan.Data.Excel/ExcelDataReader.cs +++ b/source/Sylvan.Data.Excel/ExcelDataReader.cs @@ -56,6 +56,7 @@ internal enum DateMode static readonly DateTime Epoch1904 = new DateTime(1904, 1, 1); private protected DateMode dateMode; + private 
protected readonly bool ignoreEmptyTrailingRows; readonly string? trueString; readonly string? falseString; @@ -102,6 +103,7 @@ private protected ExcelDataReader(Stream stream, ExcelDataReaderOptions options) this.culture = options.Culture; this.dateTimeFormat = options.DateTimeFormat; this.ownsStream = options.OwnsStream; + this.ignoreEmptyTrailingRows = options.IgnoreEmptyTrailingRows; } #if ASYNC diff --git a/source/Sylvan.Data.Excel/ExcelDataReaderOptions.cs b/source/Sylvan.Data.Excel/ExcelDataReaderOptions.cs index 5bc4795..71873c5 100644 --- a/source/Sylvan.Data.Excel/ExcelDataReaderOptions.cs +++ b/source/Sylvan.Data.Excel/ExcelDataReaderOptions.cs @@ -16,8 +16,23 @@ public ExcelDataReaderOptions() { this.Schema = ExcelSchema.Default; this.Culture = CultureInfo.InvariantCulture; + this.IgnoreEmptyTrailingRows = true; } + /// <summary> + /// Indicates that any trailing rows that contain only empty cells should be ignored. + /// Defaults to true. + /// </summary> + /// <remarks> + /// By default, ExcelDataReader will ignore any rows at the end of the file that + /// contain only empty cells. Skipping the empty rows requires reading data until either a non-empty + /// row is found, or the end of the file. In some situations, this can require reading a million rows + /// only to discover that there is no more data, which takes a measurable amount of time. This option + /// allows consumers to have more control, and optionally stop reading when the first empty row is found + /// with the expectation that it marks the end of the data. + /// </remarks> + public bool IgnoreEmptyTrailingRows { get; set; } + /// /// Gets or sets the schema for the data in the workbook. 
/// diff --git a/source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj b/source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj index 233377a..76c2c8a 100644 --- a/source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj +++ b/source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj @@ -3,7 +3,7 @@ net6.0;netstandard2.1;netstandard2.0 latest - 0.4.21 + 0.4.22 A cross-platform .NET library for reading Excel data files. excel;xls;xlsx;xlsb;datareader enable diff --git a/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs b/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs index 2705c52..b215ab1 100644 --- a/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs +++ b/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs @@ -27,6 +27,8 @@ public XlsSheetInfo(string name, int offset, bool hidden) : base(name, hidden) int curFieldCount = 0; int pendingRow = -1; + int rowCellCount = 0; + internal XlsWorkbookReader(Stream stream, ExcelDataReaderOptions options) : base(stream, options) { var pkg = new Ole2Package(stream); @@ -88,14 +90,20 @@ public override bool Read() } rowIndex++; - if (NextRow()) + var count = NextRow(); + + if (count < 0) { - return true; + if (this.rowCellCount > 0 && this.ignoreEmptyTrailingRows == false) + { + return true; + } + this.state = State.End; + return false; } else { - this.state = State.End; - return false; + return true; } } @@ -379,7 +387,10 @@ void ParseFormula() SetRowData(colIdx, new FieldInfo((ExcelErrorCode)rval)); break; default: - throw new InvalidDataException(); + // this seems to indicate the function result is null, + // though the spec doesn't make this clear. 
+ SetRowData(colIdx, new FieldInfo()); + break; } } else @@ -398,33 +409,46 @@ void SetRowData(int colIdx, FieldInfo cd) { Array.Resize(ref values, Math.Max(8, values.Length * 2)); } - rowFieldCount = Math.Max(rowFieldCount, colIdx + 1); + if (!cd.IsEmptyValue) + { + this.rowFieldCount = Math.Max(rowFieldCount, colIdx + 1); + } + this.rowCellCount++; values[colIdx] = cd; } - bool NextRow() + int NextRow() { // clear out any fields from previous row Array.Clear(this.values, 0, this.values.Length); + // rowFieldCount records the last non-empty cell. this.rowFieldCount = 0; + // rowCellCount records the number of cells that have any (even empty string) values + this.rowCellCount = 0; + do { if (pendingRow == -1) { if (!reader.NextRecord()) { - return false; + // reached the end of the records stream before finding any more cells + return -1; } } if (rowIndex < pendingRow) { - return true; + // the current row is empty but there is more data after. + return 0; } pendingRow = -1; + // this first switch is only concerned with "peeking" at the next cell record + // to determine if it is for the current row (rowIndex), or if the current row + // is empty where the next cell is for a subsequent row. switch (reader.Type) { case RecordType.LabelSST: @@ -439,8 +463,9 @@ bool NextRow() { if (this.rowIndex < peekRow) { + // the current row is empty but we've seen a cell for a subsequent row. pendingRow = peekRow; - return true; + return 0; } else { @@ -451,14 +476,18 @@ bool NextRow() case RecordType.EOF: if (this.rowFieldCount > 0) { + // we've reached the end of the data stream + // and have cells in the current row if (pendingRow == int.MinValue) { - return false; + return -1; } else { + // set pending row such that we will come back to return -1 + // the next time we read a row. 
pendingRow = int.MinValue; - return true; + return 0; } } break; @@ -499,7 +528,7 @@ bool NextRow() // this should only apply to formulas, and is handled inline break; case RecordType.EOF: - return this.RowFieldCount > 0; + return this.rowFieldCount == 0 ? -1 : this.rowFieldCount; default: break; } diff --git a/source/Sylvan.Data.Excel/Xlsb/XlsbWorkbookReader.cs b/source/Sylvan.Data.Excel/Xlsb/XlsbWorkbookReader.cs index 1f60b85..ca798e4 100644 --- a/source/Sylvan.Data.Excel/Xlsb/XlsbWorkbookReader.cs +++ b/source/Sylvan.Data.Excel/Xlsb/XlsbWorkbookReader.cs @@ -203,7 +203,7 @@ bool InitializeSheet() if (parsedRowIndex > 0) { this.rowFieldCount = 0; - } + } this.state = State.Initialized; this.rowIndex = LoadSchema() ? -1 : 0; @@ -266,7 +266,7 @@ private protected override string GetSharedString(int idx) public override bool Read() { rowIndex++; - start: + start: if (state == State.Open) { if (rowIndex <= parsedRowIndex) @@ -293,7 +293,12 @@ public override bool Read() } if (c == 0) { - continue; + if (this.ignoreEmptyTrailingRows) + { + continue; + } + + this.rowFieldCount = 0; } if (rowIndex < parsedRowIndex) { @@ -436,17 +441,16 @@ static void EnsureCols(ref FieldInfo[] values, int c) case RecordType.CellIsst: type = ExcelDataType.String; var sstIdx = reader.GetInt32(8); - fi.isSS = true; fi.ssIdx = sstIdx; - //fi.strValue = sst[sstIdx]; notNull++; break; case RecordType.CellSt: case RecordType.CellFmlaString: type = ExcelDataType.String; fi.strValue = reader.GetString(8); - notNull++; + if (fi.strValue.Length > 0) + notNull++; break; case RecordType.CellFmlaNum: type = ExcelDataType.Numeric; @@ -455,6 +459,7 @@ static void EnsureCols(ref FieldInfo[] values, int c) break; } + fi.type = type; fi.xfIdx = sf; count = col + 1; diff --git a/source/Sylvan.Data.Excel/Xlsx/XlsxWorkbookReader.cs b/source/Sylvan.Data.Excel/Xlsx/XlsxWorkbookReader.cs index d263ff9..bb46dc0 100644 --- a/source/Sylvan.Data.Excel/Xlsx/XlsxWorkbookReader.cs +++ 
b/source/Sylvan.Data.Excel/Xlsx/XlsxWorkbookReader.cs @@ -405,8 +405,13 @@ public override bool Read() var c = ParseRowValues(); if (c == 0) { - // handles trailing empty rows. - continue; + if (this.ignoreEmptyTrailingRows) + { + // handles trailing empty rows. + continue; + } + + this.rowFieldCount = 0; } if (rowIndex < parsedRowIndex) {