Skip to content
This repository has been archived by the owner on Dec 21, 2023. It is now read-only.

Commit

Permalink
Fix C++ unit test failure, attempt #2 (#339)
Browse files Browse the repository at this point in the history
The `readsome` function doesn't behave the way I expected - "available"
means "already in the buffer", not "can be read from the stream", which
means typically it reads 0 bytes on a newly initialized stream (which
breaks skip_BOM). We need to use `get`, and after each call, make sure
the stream is still in a good state (or recover if it's not).

This one passes both the C++ unity_sarray.cxxtest and Python
test_sframe.py.
  • Loading branch information
znation authored Mar 6, 2018
1 parent 749abd9 commit 4bd7f46
Showing 1 changed file with 38 additions and 16 deletions.
54 changes: 38 additions & 16 deletions src/sframe/parallel_csv_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,30 +32,52 @@ using fileio::file_status;
* Escape BOMs
*/
void skip_BOM(general_ifstream& fin) {
const size_t BOM_SIZE = 3;
char bom[BOM_SIZE];
char fChar, sChar, tChar;
size_t count = 0;
fChar = fin.get();
count = fin.gcount();
if (!fin.good()) {
if (count == 1) {
fin.putback(fChar);
fin.clear(std::ios_base::goodbit);
} else {
DASSERT_EQ(count, 0);
}
return;
}
// use readsome to avoid setting failbit on eof
// (.get sets failbit!)
auto size = fin.readsome(bom, sizeof(bom));
if (size != BOM_SIZE) {
DASSERT_LT(size, BOM_SIZE);
if (size == 2) {
fin.putback(bom[1]);
DASSERT_EQ(count, 1);
sChar = fin.get();
count = fin.gcount();
if (!fin.good()) {
fin.clear(std::ios_base::goodbit);
if (count == 1) {
fin.putback(sChar);
} else {
DASSERT_EQ(count, 0);
}
if (size >= 1) {
fin.putback(bom[0]);
fin.putback(fChar);
return;
}
DASSERT_EQ(count, 1);
tChar = fin.get();
count = fin.gcount();
if (fin.bad() || count == 0) {
fin.clear(std::ios_base::goodbit);
if (count == 1) {
fin.putback(tChar);
} else {
DASSERT_EQ(count, 0);
}
fin.putback(sChar);
fin.putback(fChar);
return;
}

bool isBOM = ((bom[0] == (char)0xEF) && (bom[1] == (char)0xBB) && (bom[2] == (char)0xBF));
DASSERT_EQ(count, 1);
bool isBOM = ((fChar == (char)0xEF) && (sChar == (char)0xBB) && (tChar == (char)0xBF));
if (!isBOM) {
fin.putback(bom[2]);
fin.putback(bom[1]);
fin.putback(bom[0]);
fin.putback(tChar);
fin.putback(sChar);
fin.putback(fChar);
}
}

Expand Down

0 comments on commit 4bd7f46

Please sign in to comment.