Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds ExcelDataReaderOptions.IgnoreEmptyTrailingRows. #168

Merged
merged 3 commits into from
May 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/ReleaseNotes.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Sylvan.Data.Excel Release Notes

_0.4.22_
- Adds `ExcelDataReaderOptions.IgnoreEmptyTrailingRows` option.

_0.4.21_
- Fixes a bug that could cause incorrect behavior when reading certain .xls files.

Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
35 changes: 35 additions & 0 deletions source/Sylvan.Data.Excel.Tests/ExcelDataReaderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1423,7 +1423,42 @@ public async Task AllFieldAccessorsThrowWhenInvalid()
Assert.Throws<InvalidOperationException>(() => edr.GetValues(values));
Assert.Throws<InvalidOperationException>(() => edr.IsDBNull(0));
await Assert.ThrowsAsync<InvalidOperationException>(async () => await edr.IsDBNullAsync(0));
}

/// <summary>
/// Verifies that, with default options (IgnoreEmptyTrailingRows defaults to true),
/// the reader returns the data rows, including the empty row between "b" and "c",
/// and then reports the end of data instead of returning the empty trailing rows.
/// </summary>
[Fact]
public void TrailingBlank()
{
    var file = GetFile("TrailingBlank");
    using var edr = ExcelDataReader.Create(file);

    Assert.True(edr.Read());
    Assert.Equal("b", edr.GetString(0));

    // an empty row that is followed by more data is not "trailing", so it is still returned.
    Assert.True(edr.Read());
    Assert.Equal("", edr.GetString(0));

    Assert.True(edr.Read());
    Assert.Equal("c", edr.GetString(0));

    // the remaining rows in the file are all empty and must be ignored by default.
    Assert.False(edr.Read());
}

/// <summary>
/// Verifies that, when IgnoreEmptyTrailingRows is disabled, the empty rows that follow
/// the last data row are returned as empty rows (zero RowFieldCount) until the reader
/// reaches row 10, after which no more rows are available.
/// </summary>
[Fact]
public void TrailingBlankNoSkip()
{
    var path = GetFile("TrailingBlank");
    var options = new ExcelDataReaderOptions();
    options.IgnoreEmptyTrailingRows = false;
    using var edr = ExcelDataReader.Create(path, options);

    // the leading rows: data, an interior empty row, then data again.
    foreach (var expected in new[] { "b", "", "c" })
    {
        Assert.True(edr.Read());
        Assert.Equal(expected, edr.GetString(0));
    }

    // every remaining row up to row 10 is returned, but carries no values.
    for (; edr.RowNumber < 10;)
    {
        Assert.True(edr.Read());
        Assert.Equal("", edr.GetString(0));
        Assert.Equal(0, edr.RowFieldCount);
    }

    Assert.False(edr.Read());
}

#if ASYNC
Expand Down
2 changes: 1 addition & 1 deletion source/Sylvan.Data.Excel.Tests/ExcelDataWriterTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ public void Simple()
{
w.Write(reader);
}
Open(f);
//Open(f);
Validate(f);
}

Expand Down
9 changes: 9 additions & 0 deletions source/Sylvan.Data.Excel/ExcelDataReader+FieldInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@ private protected struct FieldInfo
public DateTime dtValue;
public int xfIdx;


/// <summary>
/// Gets a value indicating whether this field holds no data: either its type is
/// <see cref="ExcelDataType.Null"/>, or it is a string field whose stored string is empty.
/// </summary>
/// <remarks>
/// A string-typed field whose <c>strValue</c> is null is not reported as empty.
/// </remarks>
internal bool IsEmptyValue
{
    get
    {
        if (this.type == ExcelDataType.Null)
        {
            return true;
        }

        if (this.type != ExcelDataType.String)
        {
            return false;
        }

        // only a present, zero-length string counts as empty.
        var text = this.strValue;
        return text != null && text.Length == 0;
    }
}

internal ExcelErrorCode ErrorCode
{
get { return (ExcelErrorCode)numValue; }
Expand Down
2 changes: 2 additions & 0 deletions source/Sylvan.Data.Excel/ExcelDataReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ internal enum DateMode
static readonly DateTime Epoch1904 = new DateTime(1904, 1, 1);

private protected DateMode dateMode;
private protected readonly bool ignoreEmptyTrailingRows;

readonly string? trueString;
readonly string? falseString;
Expand Down Expand Up @@ -102,6 +103,7 @@ private protected ExcelDataReader(Stream stream, ExcelDataReaderOptions options)
this.culture = options.Culture;
this.dateTimeFormat = options.DateTimeFormat;
this.ownsStream = options.OwnsStream;
this.ignoreEmptyTrailingRows = options.IgnoreEmptyTrailingRows;
}

#if ASYNC
Expand Down
15 changes: 15 additions & 0 deletions source/Sylvan.Data.Excel/ExcelDataReaderOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,23 @@ public ExcelDataReaderOptions()
{
this.Schema = ExcelSchema.Default;
this.Culture = CultureInfo.InvariantCulture;
this.IgnoreEmptyTrailingRows = true;
}

/// <summary>
/// Gets or sets a value indicating whether trailing rows that contain only empty cells
/// should be ignored. Defaults to true.
/// </summary>
/// <remarks>
/// By default, ExcelDataReader will ignore any rows at the end of the file that
/// contain only empty cells. Skipping the empty rows requires reading data until either a non-empty
/// row is found, or the end of the file is reached. In some situations, this can require reading a
/// million rows only to discover that there is no more data, which takes a measurable amount of time.
/// Setting this option to false gives consumers more control: empty rows are returned instead of
/// being skipped, so a consumer can choose to stop reading when the first empty row is found,
/// with the expectation that it marks the end of the data.
/// </remarks>
public bool IgnoreEmptyTrailingRows { get; set; }

/// <summary>
/// Gets or sets the schema for the data in the workbook.
/// </summary>
Expand Down
2 changes: 1 addition & 1 deletion source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<PropertyGroup>
<TargetFrameworks>net6.0;netstandard2.1;netstandard2.0</TargetFrameworks>
<LangVersion>latest</LangVersion>
<VersionPrefix>0.4.21</VersionPrefix>
<VersionPrefix>0.4.22</VersionPrefix>
<Description>A cross-platform .NET library for reading Excel data files.</Description>
<PackageTags>excel;xls;xlsx;xlsb;datareader</PackageTags>
<Nullable>enable</Nullable>
Expand Down
55 changes: 42 additions & 13 deletions source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ public XlsSheetInfo(string name, int offset, bool hidden) : base(name, hidden)
int curFieldCount = 0;
int pendingRow = -1;

int rowCellCount = 0;

internal XlsWorkbookReader(Stream stream, ExcelDataReaderOptions options) : base(stream, options)
{
var pkg = new Ole2Package(stream);
Expand Down Expand Up @@ -88,14 +90,20 @@ public override bool Read()
}
rowIndex++;

if (NextRow())
var count = NextRow();

if (count < 0)
{
return true;
if (this.rowCellCount > 0 && this.ignoreEmptyTrailingRows == false)
{
return true;
}
this.state = State.End;
return false;
}
else
{
this.state = State.End;
return false;
return true;
}
}

Expand Down Expand Up @@ -379,7 +387,10 @@ void ParseFormula()
SetRowData(colIdx, new FieldInfo((ExcelErrorCode)rval));
break;
default:
throw new InvalidDataException();
// this seems to indicate the function result is null,
// though the spec doesn't make this clear.
SetRowData(colIdx, new FieldInfo());
break;
}
}
else
Expand All @@ -398,33 +409,46 @@ void SetRowData(int colIdx, FieldInfo cd)
{
Array.Resize(ref values, Math.Max(8, values.Length * 2));
}
rowFieldCount = Math.Max(rowFieldCount, colIdx + 1);
if (!cd.IsEmptyValue)
{
this.rowFieldCount = Math.Max(rowFieldCount, colIdx + 1);
}
this.rowCellCount++;
values[colIdx] = cd;
}


bool NextRow()
int NextRow()
{
// clear out any fields from previous row
Array.Clear(this.values, 0, this.values.Length);
// rowFieldCount records the last non-empty cell.
this.rowFieldCount = 0;
// rowCellCount records the number of cells that have any (even empty string) values
this.rowCellCount = 0;

do
{
if (pendingRow == -1)
{
if (!reader.NextRecord())
{
return false;
// reached the end of the records stream before finding any more cells
return -1;
}
}

if (rowIndex < pendingRow)
{
return true;
// the current row is empty but there is more data after.
return 0;
}

pendingRow = -1;

// this first switch is only concerned with "peeking" at the next cell record
// to determine if it is for the current row (rowIndex), or if the current row
// is empty where the next cell is for a subsequent row.
switch (reader.Type)
{
case RecordType.LabelSST:
Expand All @@ -439,8 +463,9 @@ bool NextRow()
{
if (this.rowIndex < peekRow)
{
// the current row is empty but we've seen a cell for a subsequent row.
pendingRow = peekRow;
return true;
return 0;
}
else
{
Expand All @@ -451,14 +476,18 @@ bool NextRow()
case RecordType.EOF:
if (this.rowFieldCount > 0)
{
// we've reached the end of the data stream
// and have cells in the current row
if (pendingRow == int.MinValue)
{
return false;
return -1;
}
else
{
// set pending row such that we will come back to return -1
// the next time we read a row.
pendingRow = int.MinValue;
return true;
return 0;
}
}
break;
Expand Down Expand Up @@ -499,7 +528,7 @@ bool NextRow()
// this should only apply to formulas, and is handled inline
break;
case RecordType.EOF:
return this.RowFieldCount > 0;
return this.rowFieldCount == 0 ? -1 : this.rowFieldCount;
default:
break;
}
Expand Down
17 changes: 11 additions & 6 deletions source/Sylvan.Data.Excel/Xlsb/XlsbWorkbookReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ bool InitializeSheet()
if (parsedRowIndex > 0)
{
this.rowFieldCount = 0;
}
}

this.state = State.Initialized;
this.rowIndex = LoadSchema() ? -1 : 0;
Expand Down Expand Up @@ -266,7 +266,7 @@ private protected override string GetSharedString(int idx)
public override bool Read()
{
rowIndex++;
start:
start:
if (state == State.Open)
{
if (rowIndex <= parsedRowIndex)
Expand All @@ -293,7 +293,12 @@ public override bool Read()
}
if (c == 0)
{
continue;
if (this.ignoreEmptyTrailingRows)
{
continue;
}

this.rowFieldCount = 0;
}
if (rowIndex < parsedRowIndex)
{
Expand Down Expand Up @@ -436,17 +441,16 @@ static void EnsureCols(ref FieldInfo[] values, int c)
case RecordType.CellIsst:
type = ExcelDataType.String;
var sstIdx = reader.GetInt32(8);

fi.isSS = true;
fi.ssIdx = sstIdx;
//fi.strValue = sst[sstIdx];
notNull++;
break;
case RecordType.CellSt:
case RecordType.CellFmlaString:
type = ExcelDataType.String;
fi.strValue = reader.GetString(8);
notNull++;
if (fi.strValue.Length > 0)
notNull++;
break;
case RecordType.CellFmlaNum:
type = ExcelDataType.Numeric;
Expand All @@ -455,6 +459,7 @@ static void EnsureCols(ref FieldInfo[] values, int c)
break;
}


fi.type = type;
fi.xfIdx = sf;
count = col + 1;
Expand Down
9 changes: 7 additions & 2 deletions source/Sylvan.Data.Excel/Xlsx/XlsxWorkbookReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -405,8 +405,13 @@ public override bool Read()
var c = ParseRowValues();
if (c == 0)
{
// handles trailing empty rows.
continue;
if (this.ignoreEmptyTrailingRows)
{
// handles trailing empty rows.
continue;
}

this.rowFieldCount = 0;
}
if (rowIndex < parsedRowIndex)
{
Expand Down
Loading