Skip to content

Commit

Permalink
Adds ExcelDataReaderOptions.IgnoreEmptyTrailingRows. (#168)
Browse files Browse the repository at this point in the history
  • Loading branch information
MarkPflug authored May 13, 2024
1 parent 203f664 commit 489a81e
Show file tree
Hide file tree
Showing 13 changed files with 126 additions and 23 deletions.
3 changes: 3 additions & 0 deletions docs/ReleaseNotes.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Sylvan.Data.Excel Release Notes

_0.4.22_
- Adds `ExcelDataReaderOptions.IgnoreEmptyTrailingRows` option.

_0.4.21_
- Fixes a bug that could cause incorrect behavior when reading certain .xls files.

Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
35 changes: 35 additions & 0 deletions source/Sylvan.Data.Excel.Tests/ExcelDataReaderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1423,7 +1423,42 @@ public async Task AllFieldAccessorsThrowWhenInvalid()
Assert.Throws<InvalidOperationException>(() => edr.GetValues(values));
Assert.Throws<InvalidOperationException>(() => edr.IsDBNull(0));
await Assert.ThrowsAsync<InvalidOperationException>(async () => await edr.IsDBNullAsync(0));
}

[Fact]
public void TrailingBlank()
{
    // Uses the default options, where IgnoreEmptyTrailingRows is enabled.
    var file = GetFile("TrailingBlank");
    using var reader = ExcelDataReader.Create(file);

    // The leading rows are expected to hold "b", an empty value, then "c" in the first column.
    foreach (var expected in new[] { "b", "", "c" })
    {
        Assert.True(reader.Read());
        Assert.Equal(expected, reader.GetString(0));
    }

    // NOTE(review): the outcome of this final Read() and the RowFieldCount that follows
    // are evaluated but never asserted -- confirm what result is expected here.
    _ = reader.Read();
    _ = reader.RowFieldCount;
}

[Fact]
public void TrailingBlankNoSkip()
{
    // With IgnoreEmptyTrailingRows disabled, the empty rows that follow the data
    // must be surfaced to the caller rather than skipped.
    var file = GetFile("TrailingBlank");
    var options = new ExcelDataReaderOptions { IgnoreEmptyTrailingRows = false };
    using var reader = ExcelDataReader.Create(file, options);

    // The leading rows hold "b", an empty value, then "c" in the first column.
    foreach (var expected in new[] { "b", "", "c" })
    {
        Assert.True(reader.Read());
        Assert.Equal(expected, reader.GetString(0));
    }

    // Every remaining row, until RowNumber reaches 10, is readable but has no fields.
    while (reader.RowNumber < 10)
    {
        Assert.True(reader.Read());
        Assert.Equal("", reader.GetString(0));
        Assert.Equal(0, reader.RowFieldCount);
    }

    // Nothing is left after the last empty row.
    Assert.False(reader.Read());
}

#if ASYNC
Expand Down
2 changes: 1 addition & 1 deletion source/Sylvan.Data.Excel.Tests/ExcelDataWriterTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ public void Simple()
{
w.Write(reader);
}
Open(f);
//Open(f);
Validate(f);
}

Expand Down
9 changes: 9 additions & 0 deletions source/Sylvan.Data.Excel/ExcelDataReader+FieldInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@ private protected struct FieldInfo
public DateTime dtValue;
public int xfIdx;


/// <summary>
/// Gets a value indicating whether this field holds no data: either its type is
/// <see cref="ExcelDataType.Null"/>, or it is a string field whose value is the empty string.
/// </summary>
/// <remarks>
/// A string field whose <c>strValue</c> is null is not reported as empty.
/// </remarks>
internal bool IsEmptyValue
{
    get
    {
        if (this.type == ExcelDataType.Null)
        {
            return true;
        }

        // Only a non-null, zero-length string counts as empty.
        return this.type == ExcelDataType.String && this.strValue is { Length: 0 };
    }
}

internal ExcelErrorCode ErrorCode
{
get { return (ExcelErrorCode)numValue; }
Expand Down
2 changes: 2 additions & 0 deletions source/Sylvan.Data.Excel/ExcelDataReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ internal enum DateMode
static readonly DateTime Epoch1904 = new DateTime(1904, 1, 1);

private protected DateMode dateMode;
private protected readonly bool ignoreEmptyTrailingRows;

readonly string? trueString;
readonly string? falseString;
Expand Down Expand Up @@ -102,6 +103,7 @@ private protected ExcelDataReader(Stream stream, ExcelDataReaderOptions options)
this.culture = options.Culture;
this.dateTimeFormat = options.DateTimeFormat;
this.ownsStream = options.OwnsStream;
this.ignoreEmptyTrailingRows = options.IgnoreEmptyTrailingRows;
}

#if ASYNC
Expand Down
15 changes: 15 additions & 0 deletions source/Sylvan.Data.Excel/ExcelDataReaderOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,23 @@ public ExcelDataReaderOptions()
{
this.Schema = ExcelSchema.Default;
this.Culture = CultureInfo.InvariantCulture;
this.IgnoreEmptyTrailingRows = true;
}

/// <summary>
/// Gets or sets a value indicating whether trailing rows that contain only empty cells
/// should be ignored. Defaults to true.
/// </summary>
/// <remarks>
/// By default, ExcelDataReader will ignore any rows at the end of the file that
/// contain only empty cells. Skipping the empty rows requires reading ahead until either a
/// non-empty row is found, or the end of the file is reached. In some situations, this can
/// require reading a million rows only to discover that there is no more data, which takes a
/// measurable amount of time. Setting this option to false causes empty rows to be returned
/// to the consumer (with a row field count of zero) instead of being skipped. This gives
/// consumers more control: they can, for example, stop reading when the first empty row is
/// found, with the expectation that it marks the end of the data.
/// </remarks>
public bool IgnoreEmptyTrailingRows { get; set; }

/// <summary>
/// Gets or sets the schema for the data in the workbook.
/// </summary>
Expand Down
2 changes: 1 addition & 1 deletion source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<PropertyGroup>
<TargetFrameworks>net6.0;netstandard2.1;netstandard2.0</TargetFrameworks>
<LangVersion>latest</LangVersion>
<VersionPrefix>0.4.21</VersionPrefix>
<VersionPrefix>0.4.22</VersionPrefix>
<Description>A cross-platform .NET library for reading Excel data files.</Description>
<PackageTags>excel;xls;xlsx;xlsb;datareader</PackageTags>
<Nullable>enable</Nullable>
Expand Down
55 changes: 42 additions & 13 deletions source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ public XlsSheetInfo(string name, int offset, bool hidden) : base(name, hidden)
int curFieldCount = 0;
int pendingRow = -1;

int rowCellCount = 0;

internal XlsWorkbookReader(Stream stream, ExcelDataReaderOptions options) : base(stream, options)
{
var pkg = new Ole2Package(stream);
Expand Down Expand Up @@ -88,14 +90,20 @@ public override bool Read()
}
rowIndex++;

if (NextRow())
var count = NextRow();

if (count < 0)
{
return true;
if (this.rowCellCount > 0 && this.ignoreEmptyTrailingRows == false)
{
return true;
}
this.state = State.End;
return false;
}
else
{
this.state = State.End;
return false;
return true;
}
}

Expand Down Expand Up @@ -379,7 +387,10 @@ void ParseFormula()
SetRowData(colIdx, new FieldInfo((ExcelErrorCode)rval));
break;
default:
throw new InvalidDataException();
// this seems to indicate the function result is null,
// though the spec doesn't make this clear.
SetRowData(colIdx, new FieldInfo());
break;
}
}
else
Expand All @@ -398,33 +409,46 @@ void SetRowData(int colIdx, FieldInfo cd)
{
Array.Resize(ref values, Math.Max(8, values.Length * 2));
}
rowFieldCount = Math.Max(rowFieldCount, colIdx + 1);
if (!cd.IsEmptyValue)
{
this.rowFieldCount = Math.Max(rowFieldCount, colIdx + 1);
}
this.rowCellCount++;
values[colIdx] = cd;
}


bool NextRow()
int NextRow()
{
// clear out any fields from previous row
Array.Clear(this.values, 0, this.values.Length);
// rowFieldCount records the last non-empty cell.
this.rowFieldCount = 0;
// rowCellCount records the number of cells that have any (even empty string) values
this.rowCellCount = 0;

do
{
if (pendingRow == -1)
{
if (!reader.NextRecord())
{
return false;
// reached the end of the records stream before finding any more cells
return -1;
}
}

if (rowIndex < pendingRow)
{
return true;
// the current row is empty but there is more data after.
return 0;
}

pendingRow = -1;

// this first switch is only concerned with "peeking" at the next cell record
// to determine if it is for the current row (rowIndex), or if the current row
// is empty where the next cell is for a subsequent row.
switch (reader.Type)
{
case RecordType.LabelSST:
Expand All @@ -439,8 +463,9 @@ bool NextRow()
{
if (this.rowIndex < peekRow)
{
// the current row is empty but we've seen a cell for a subsequent row.
pendingRow = peekRow;
return true;
return 0;
}
else
{
Expand All @@ -451,14 +476,18 @@ bool NextRow()
case RecordType.EOF:
if (this.rowFieldCount > 0)
{
// we've reached the end of the data stream
// and have cells in the current row
if (pendingRow == int.MinValue)
{
return false;
return -1;
}
else
{
// set pending row such that we will come back to return -1
// the next time we read a row.
pendingRow = int.MinValue;
return true;
return 0;
}
}
break;
Expand Down Expand Up @@ -499,7 +528,7 @@ bool NextRow()
// this should only apply to formulas, and is handled inline
break;
case RecordType.EOF:
return this.RowFieldCount > 0;
return this.rowFieldCount == 0 ? -1 : this.rowFieldCount;
default:
break;
}
Expand Down
17 changes: 11 additions & 6 deletions source/Sylvan.Data.Excel/Xlsb/XlsbWorkbookReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ bool InitializeSheet()
if (parsedRowIndex > 0)
{
this.rowFieldCount = 0;
}
}

this.state = State.Initialized;
this.rowIndex = LoadSchema() ? -1 : 0;
Expand Down Expand Up @@ -266,7 +266,7 @@ private protected override string GetSharedString(int idx)
public override bool Read()
{
rowIndex++;
start:
start:
if (state == State.Open)
{
if (rowIndex <= parsedRowIndex)
Expand All @@ -293,7 +293,12 @@ public override bool Read()
}
if (c == 0)
{
continue;
if (this.ignoreEmptyTrailingRows)
{
continue;
}

this.rowFieldCount = 0;
}
if (rowIndex < parsedRowIndex)
{
Expand Down Expand Up @@ -436,17 +441,16 @@ static void EnsureCols(ref FieldInfo[] values, int c)
case RecordType.CellIsst:
type = ExcelDataType.String;
var sstIdx = reader.GetInt32(8);

fi.isSS = true;
fi.ssIdx = sstIdx;
//fi.strValue = sst[sstIdx];
notNull++;
break;
case RecordType.CellSt:
case RecordType.CellFmlaString:
type = ExcelDataType.String;
fi.strValue = reader.GetString(8);
notNull++;
if (fi.strValue.Length > 0)
notNull++;
break;
case RecordType.CellFmlaNum:
type = ExcelDataType.Numeric;
Expand All @@ -455,6 +459,7 @@ static void EnsureCols(ref FieldInfo[] values, int c)
break;
}


fi.type = type;
fi.xfIdx = sf;
count = col + 1;
Expand Down
9 changes: 7 additions & 2 deletions source/Sylvan.Data.Excel/Xlsx/XlsxWorkbookReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -405,8 +405,13 @@ public override bool Read()
var c = ParseRowValues();
if (c == 0)
{
// handles trailing empty rows.
continue;
if (this.ignoreEmptyTrailingRows)
{
// handles trailing empty rows.
continue;
}

this.rowFieldCount = 0;
}
if (rowIndex < parsedRowIndex)
{
Expand Down

0 comments on commit 489a81e

Please sign in to comment.