Skip to content

Commit

Permalink
Lazy sst (#135)
Browse files Browse the repository at this point in the history
Make shared string loading lazy and only happen when the string is accessed.
  • Loading branch information
MarkPflug authored Sep 26, 2023
1 parent eae44e7 commit aa001a2
Show file tree
Hide file tree
Showing 10 changed files with 222 additions and 97 deletions.
2 changes: 2 additions & 0 deletions docs/ReleaseNotes.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
_0.4.17_
- Exclude phonetic component when reading string values.
- Allow invalid ref values, which Excel appears to treat as missing.
- SharedString tables are read lazily instead of eagerly for .xlsx and .xlsb files, allowing
faster access to early records in some cases.

_0.4.16_
- Adds ExcelFileType class that exposes constants about supported Excel formats: extensions and content types.
Expand Down
15 changes: 13 additions & 2 deletions source/Sylvan.Data.Excel.Tests/CustomTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,25 @@ public void EmptyInlineStr()
[Fact]
public void EmptyTrailingRow()
{
// The final (trailing) row of this sheet contains shared string references
// to an empty string. Such a row is still read as a valid row: Read() returns
// true for it, and each of its fields reports null via IsDBNull and "" via
// GetString. This should be uncommon enough that it won't affect anyone
// in practice.

var reader = XlsxBuilder.Create(TestData.EmptySSTrailingRow, TestData.SharedStringEmpty);
// First row: ordinary data.
Assert.True(reader.Read());
Assert.Equal(3, reader.RowFieldCount);
Assert.Equal("a", reader.GetString(0));
Assert.Equal("a", reader.GetString(1));

// Second row: the trailing row whose cells reference the empty shared string.
Assert.True(reader.Read());

for (int i = 0; i < reader.RowFieldCount; i++)
{
Assert.True(reader.IsDBNull(i));
Assert.Equal("", reader.GetString(i));

}

// Nothing follows the trailing row.
Assert.False(reader.Read());
}

Expand Down
2 changes: 1 addition & 1 deletion source/Sylvan.Data.Excel.Tests/ExcelDataWriterTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ public void Violence()
{
w.Write(reader);
}
Open(f);
//Open(f);
Validate(f);
}

Expand Down
41 changes: 41 additions & 0 deletions source/Sylvan.Data.Excel.Tests/ExternalDataTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,47 @@ public static IEnumerable<object[]> GetInputs()
}
}


[Fact]
public void AnalyzeFiles()
{
    // Opt-in scan: does nothing unless the SylvanExcelTestData environment
    // variable is set to a non-empty directory path.
    var dataDir = Environment.GetEnvironmentVariable("SylvanExcelTestData");
    if (!string.IsNullOrEmpty(dataDir))
    {
        foreach (var path in Directory.EnumerateFiles(dataDir, "*.xlsx"))
        {
            AnalyzeFile(path);
        }
    }
}

// Reads every row of the given workbook and writes a line to the test output
// for each string-typed cell whose value is the empty string (file name, row
// number, column ordinal). Any exception is logged instead of thrown, so a
// single unreadable file does not abort the scan of the remaining files.
void AnalyzeFile(string file)
{
    try
    {
        // Dispose the reader when done; without this every scanned file
        // leaves its reader (and whatever resources it holds) open.
        using var edr = ExcelDataReader.Create(file);
        while (edr.Read())
        {
            for (int i = 0; i < edr.RowFieldCount; i++)
            {
                // Check the cell type first so GetString is only called on string cells.
                if (edr.GetExcelDataType(i) == ExcelDataType.String && edr.GetString(i) == "")
                {
                    o.WriteLine($"{Path.GetFileName(file)} {edr.RowNumber} {i}");
                }
            }
        }
    }
    catch (Exception e)
    {
        o.WriteLine($"{Path.GetFileName(file)} ERROR {e.Message}");
    }
}

[Fact]
public void XmlCharRegex()
{
Expand Down
2 changes: 2 additions & 0 deletions source/Sylvan.Data.Excel/ExcelDataReader+FieldInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ private protected struct FieldInfo
public static readonly FieldInfo Null = default;

public ExcelDataType type;
public bool isSS;
public string? strValue;
public int ssIdx;
public double numValue;
public DateTime dtValue;
public int xfIdx;
Expand Down
14 changes: 10 additions & 4 deletions source/Sylvan.Data.Excel/ExcelDataReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
using System.Collections;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.ComponentModel.Design;
using System.Data;
using System.Data.Common;
using System.Globalization;
Expand Down Expand Up @@ -857,9 +856,16 @@ public sealed override string GetString(int ordinal)
case ExcelDataType.Numeric:
return FormatVal(fi.xfIdx, fi.numValue);
}
return fi.strValue ?? string.Empty;
return ProcString(in fi);
}

// Resolves the string value of a field: shared-string fields are looked up
// by index through GetSharedString, all others use the stored strValue.
// A null result is reported as the empty string.
string ProcString(in FieldInfo fi)
{
    string? text;
    if (fi.isSS)
    {
        text = GetSharedString(fi.ssIdx);
    }
    else
    {
        text = fi.strValue;
    }
    return text ?? string.Empty;
}

// Returns the shared string at index idx. Implemented by each
// format-specific reader; called when a field is flagged as a shared
// string (isSS) and its text is actually requested.
private protected abstract string GetSharedString(int idx);

string FormatVal(int xfIdx, double val)
{
var fmtIdx = xfIdx >= this.xfMap.Length ? -1 : this.xfMap[xfIdx];
Expand Down Expand Up @@ -891,7 +897,7 @@ public sealed override double GetDouble(int ordinal)
switch (cell.type)
{
case ExcelDataType.String:
return double.Parse(cell.strValue!, culture);
return double.Parse(ProcString(in cell), culture);
case ExcelDataType.Numeric:
return cell.numValue;
case ExcelDataType.Error:
Expand Down Expand Up @@ -924,7 +930,7 @@ public sealed override bool GetBoolean(int ordinal)
var trueString = col?.TrueString ?? this.trueString;
var falseString = col?.FalseString ?? this.falseString;

var strVal = fi.strValue;
var strVal = ProcString(in fi);
var c = StringComparer.OrdinalIgnoreCase;

if (trueString != null && c.Equals(strVal, trueString))
Expand Down
1 change: 0 additions & 1 deletion source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
<TargetFrameworks>net6.0;netstandard2.1;netstandard2.0</TargetFrameworks>
<LangVersion>latest</LangVersion>
<VersionPrefix>0.4.17</VersionPrefix>
<VersionSuffix>b0003</VersionSuffix>
<Description>A cross-platform .NET library for reading Excel data files.</Description>
<PackageTags>excel;xls;xlsx;xlsb;datareader</PackageTags>
<Nullable>enable</Nullable>
Expand Down
6 changes: 6 additions & 0 deletions source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,12 @@ public override bool Read()
return NextRow();
}

// .xls eagerly loads the shared strings, so the lookup is a direct
// index into the already-populated table.
private protected override string GetSharedString(int idx) => sst[idx];

public override int MaxFieldCount => 256;

BOFType ReadBOF()
Expand Down
68 changes: 46 additions & 22 deletions source/Sylvan.Data.Excel/Xlsb/XlsbWorkbookReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,17 @@ sealed class XlsbWorkbookReader : ExcelDataReader
int parsedRowIndex = -1;
int curFieldCount = -1;

readonly ZipArchiveEntry? sstPart;
Stream? sstStream;
RecordReader? sstReader;
int sstIdx = -1;

public override ExcelWorkbookType WorkbookType => ExcelWorkbookType.ExcelXml;

// Closes the worksheet stream and the shared string stream when they
// have been opened, then lets the base reader close itself.
public override void Close()
{
    var sheet = this.sheetStream;
    if (sheet != null)
    {
        sheet.Close();
    }

    var sharedStrings = this.sstStream;
    if (sharedStrings != null)
    {
        sharedStrings.Close();
    }

    base.Close();
}

Expand All @@ -53,7 +59,7 @@ public XlsbWorkbookReader(Stream stream, ExcelDataReaderOptions opts) : base(str

var stylePart = package.GetEntry(stylesPartName);

sst = ReadSharedStrings(sharedStringsPartName);
this.sstPart = package.GetEntry(sharedStringsPartName);

var sheetNameList = new List<SheetInfo>();
using (Stream sheetsStream = workbookPart.Open())
Expand Down Expand Up @@ -213,41 +219,57 @@ bool InitializeSheet()
return true;
}

string[] ReadSharedStrings(string sharedStringsPartName)
bool LoadSst(int idx)
{
var ssPart = package.GetEntry(sharedStringsPartName);
if (ssPart == null)
var reader = this.sstReader;
if (sstPart == null)
{
return Array.Empty<string>();
return false;
}
using (var stream = ssPart.Open())
if (reader == null)
{
var reader = new RecordReader(stream);

this.sstStream = sstPart.Open();
reader = this.sstReader = new RecordReader(this.sstStream);
reader.NextRecord();
if (reader.RecordType != RecordType.SSTBegin)
throw new InvalidDataException();

int totalCount = reader.GetInt32(0);
int count = reader.GetInt32(4);

var ss = new string[count];
if (count > 128)
count = 128;
this.sst = new string[count];
}
while (idx > this.sstIdx)
{
if (!reader.NextRecord() || reader.RecordType != RecordType.SSTItem)
{
throw new InvalidDataException();
}

for (int i = 0; i < count; i++)
var flags = reader.GetByte(0);
var str = reader.GetString(1);
this.sstIdx++;
if (sstIdx >= this.sst.Length)
{
reader.NextRecord();
if (reader.RecordType != RecordType.SSTItem)
{
reader.DebugInfo("fail");
throw new InvalidDataException();
}
Array.Resize(ref sst, sst.Length * 2);
}
sst[sstIdx] = str;
}
return true;
}

var flags = reader.GetByte(0);
var str = reader.GetString(1);
ss[i] = str;
private protected override string GetSharedString(int idx)
{
if (this.sstIdx < idx)
{
if (!LoadSst(idx))
{
throw new InvalidDataException();
}
return ss;
}
return sst[idx];
}

public override bool Read()
Expand Down Expand Up @@ -427,7 +449,10 @@ static void EnsureCols(ref FieldInfo[] values, int c)
case RecordType.CellIsst:
type = ExcelDataType.String;
var sstIdx = reader.GetInt32(8);
fi.strValue = sst[sstIdx];

fi.isSS = true;
fi.ssIdx = sstIdx;
//fi.strValue = sst[sstIdx];
notNull++;
break;
case RecordType.CellSt:
Expand All @@ -443,7 +468,6 @@ static void EnsureCols(ref FieldInfo[] values, int c)
break;
}


fi.type = type;
fi.xfIdx = sf;
count = col + 1;
Expand Down
Loading

0 comments on commit aa001a2

Please sign in to comment.