From 4bd7f4634b75c0d825f8503c06b12103660c092c Mon Sep 17 00:00:00 2001 From: Zach Nation Date: Mon, 5 Mar 2018 17:23:40 -0800 Subject: [PATCH] Fix C++ unit test failure, attempt #2 (#339) The `readsome` function doesn't behave the way I expected - "available" means "already in the buffer", not "can be read from the stream", which means typically it reads 0 bytes on a newly initialized stream (which breaks skip_BOM). We need to use `get`, and after each call, make sure the stream is still in a good state (or recover if it's not). This one passes both the C++ unity_sarray.cxxtest and Python test_sframe.py. --- src/sframe/parallel_csv_parser.cpp | 54 +++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/src/sframe/parallel_csv_parser.cpp b/src/sframe/parallel_csv_parser.cpp index 841fe9ea9b..a13a9a81dc 100644 --- a/src/sframe/parallel_csv_parser.cpp +++ b/src/sframe/parallel_csv_parser.cpp @@ -32,30 +32,52 @@ using fileio::file_status; * Escape BOMs */ void skip_BOM(general_ifstream& fin) { - const size_t BOM_SIZE = 3; - char bom[BOM_SIZE]; + char fChar, sChar, tChar; + size_t count = 0; + fChar = fin.get(); + count = fin.gcount(); if (!fin.good()) { + if (count == 1) { + fin.putback(fChar); + fin.clear(std::ios_base::goodbit); + } else { + DASSERT_EQ(count, 0); + } return; } - // use readsome to avoid setting failbit on eof - // (.get sets failbit!) - auto size = fin.readsome(bom, sizeof(bom)); - if (size != BOM_SIZE) { - DASSERT_LT(size, BOM_SIZE); - if (size == 2) { - fin.putback(bom[1]); + DASSERT_EQ(count, 1); + sChar = fin.get(); + count = fin.gcount(); + if (!fin.good()) { + fin.clear(std::ios_base::goodbit); + if (count == 1) { + fin.putback(sChar); + } else { + DASSERT_EQ(count, 0); } - if (size >= 1) { - fin.putback(bom[0]); + fin.putback(fChar); + return; + } + DASSERT_EQ(count, 1); + tChar = fin.get(); + count = fin.gcount(); + if (fin.bad() || count == 0) { + fin.clear(std::ios_base::goodbit); + if (count == 1) { + fin.putback(tChar); + } else { + DASSERT_EQ(count, 0); } + fin.putback(sChar); + fin.putback(fChar); return; } - - bool isBOM = ((bom[0] == (char)0xEF) && (bom[1] == (char)0xBB) && (bom[2] == (char)0xBF)); + DASSERT_EQ(count, 1); + bool isBOM = ((fChar == (char)0xEF) && (sChar == (char)0xBB) && (tChar == (char)0xBF)); if (!isBOM) { - fin.putback(bom[2]); - fin.putback(bom[1]); - fin.putback(bom[0]); + fin.putback(tChar); + fin.putback(sChar); + fin.putback(fChar); } }