From 37d771ba901d17b3cf274b731591c74af731a85d Mon Sep 17 00:00:00 2001 From: Graham Stewart Date: Tue, 15 Mar 2022 21:40:57 -0700 Subject: [PATCH 1/4] Refactored long methods append and quoteElement --- .../com/univocity/parsers/csv/CsvWriter.java | 128 ++++++------------ 1 file changed, 41 insertions(+), 87 deletions(-) diff --git a/src/main/java/com/univocity/parsers/csv/CsvWriter.java b/src/main/java/com/univocity/parsers/csv/CsvWriter.java index 5399b265..ff87c3f9 100644 --- a/src/main/java/com/univocity/parsers/csv/CsvWriter.java +++ b/src/main/java/com/univocity/parsers/csv/CsvWriter.java @@ -33,7 +33,6 @@ */ public class CsvWriter extends AbstractWriter { - private char delimiter; private char[] multiDelimiter; private char quoteChar; private char escapeChar; @@ -143,8 +142,8 @@ protected final void initialize(CsvWriterSettings settings) { CsvFormat format = settings.getFormat(); this.multiDelimiter = format.getDelimiterString().toCharArray(); if (multiDelimiter.length == 1) { - delimiter = multiDelimiter[0]; - multiDelimiter = null; + //delimiter = multiDelimiter[0]; + //multiDelimiter = null; } this.quoteChar = format.getQuote(); this.escapeChar = format.getQuoteEscape(); @@ -209,11 +208,7 @@ protected void processRow(Object[] row) { } for (int i = 0; i < row.length; i++) { if (i != 0) { - if (multiDelimiter == null) { - appendToRow(delimiter); - } else { - appendToRow(multiDelimiter); - } + appendToRow(multiDelimiter); } if (dontProcessNormalizedNewLines) { @@ -270,36 +265,18 @@ private boolean matchMultiDelimiter(String element, int from) { private boolean quoteElement(int start, String element) { final int length = element.length(); - if (multiDelimiter == null) { - if (maxTrigger == 0) { - for (int i = start; i < length; i++) { - char nextChar = element.charAt(i); - if (nextChar == delimiter || nextChar == newLine) { - return true; - } - } - } else { - for (int i = start; i < length; i++) { - char nextChar = element.charAt(i); - if (nextChar == delimiter || nextChar < maxTrigger && quotationTriggers[nextChar]) { - return true; - } + if (maxTrigger == 0) { + for (int i = start; i < length; i++) { + char nextChar = element.charAt(i); + if (delimiterChecker(nextChar, element, i)|| nextChar == newLine) { + return true; } } } else { - if (maxTrigger == 0) { - for (int i = start; i < length; i++) { - char nextChar = element.charAt(i); - if ((nextChar == multiDelimiter[0] && matchMultiDelimiter(element, i + 1)) || nextChar == newLine) { - return true; - } - } - } else { - for (int i = start; i < length; i++) { - char nextChar = element.charAt(i); - if ((nextChar == multiDelimiter[0] && matchMultiDelimiter(element, i + 1)) || nextChar < maxTrigger && quotationTriggers[nextChar]) { - return true; - } + for (int i = start; i < length; i++) { + char nextChar = element.charAt(i); + if (delimiterChecker(nextChar, element, i)|| nextChar < maxTrigger && quotationTriggers[nextChar]) { + return true; } } } @@ -342,63 +319,32 @@ private boolean append(int columnIndex, boolean isElementQuoted, boolean allowTr int i = start; char ch = '\0'; - if (multiDelimiter == null) { - for (; i < length; i++) { - ch = element.charAt(i); - if (ch == quoteChar || ch == delimiter || ch == escapeChar || (ch < maxTrigger && quotationTriggers[ch])) { - appender.append(element, start, i); - start = i + 1; - - if (ch == quoteChar || ch == escapeChar) { - if (quoteElement(i, element)) { - appendQuoted(i, allowTrim, element); - return true; - } else if (escapeUnquoted) { - appendQuoted(i, allowTrim, element); - } else { - appender.append(element, i, length); - if (allowTrim && ignoreTrailing && element.charAt(length - 1) <= ' ' && whitespaceRangeStart < element.charAt(length - 1)) { - appender.updateWhitespace(); - } - } - return isElementQuoted; - } else if (ch == escapeChar && inputNotEscaped && escapeEscape != '\0' && escapeUnquoted) { - appender.append(escapeEscape); - } else if (ch == delimiter || ch < maxTrigger && quotationTriggers[ch]) { + for (; i < length; i++) { + ch = element.charAt(i); + if (ch == quoteChar || delimiterChecker(ch, element, i) || ch == escapeChar || (ch < maxTrigger && quotationTriggers[ch])) { + appender.append(element, start, i); + start = i + 1; + + if (ch == quoteChar || ch == escapeChar) { + if (quoteElement(i, element)) { appendQuoted(i, allowTrim, element); return true; - } - appender.append(ch); - } - } - } else { - for (; i < length; i++) { - ch = element.charAt(i); - if (ch == quoteChar || (ch == multiDelimiter[0] && matchMultiDelimiter(element, i + 1)) || ch == escapeChar || (ch < maxTrigger && quotationTriggers[ch])) { - appender.append(element, start, i); - start = i + 1; - - if (ch == quoteChar || ch == escapeChar) { - if (quoteElement(i, element)) { - appendQuoted(i, allowTrim, element); - return true; - } else if (escapeUnquoted) { - appendQuoted(i, allowTrim, element); - } else { - appender.append(element, i, length); - if (allowTrim && ignoreTrailing && element.charAt(length - 1) <= ' ' && whitespaceRangeStart < element.charAt(length - 1)) { - appender.updateWhitespace(); - } - } - return isElementQuoted; - } else if (ch == escapeChar && inputNotEscaped && escapeEscape != '\0' && escapeUnquoted) { - appender.append(escapeEscape); - } else if ((ch == multiDelimiter[0] && matchMultiDelimiter(element, i + 1)) || ch < maxTrigger && quotationTriggers[ch]) { + } else if (escapeUnquoted) { appendQuoted(i, allowTrim, element); - return true; + } else { + appender.append(element, i, length); + if (allowTrim && ignoreTrailing && element.charAt(length - 1) <= ' ' && whitespaceRangeStart < element.charAt(length - 1)) { + appender.updateWhitespace(); + } } - appender.append(ch); + return isElementQuoted; + } else if (ch == escapeChar && inputNotEscaped && escapeEscape != '\0' && escapeUnquoted) { + appender.append(escapeEscape); + } else if (delimiterChecker(ch, element, i)|| ch < maxTrigger && quotationTriggers[ch]) { + appendQuoted(i, allowTrim, element); + return true; } + appender.append(ch); } } @@ -409,6 +355,14 @@ private boolean append(int columnIndex, boolean isElementQuoted, boolean allowTr return isElementQuoted; } + private boolean delimiterChecker(char ch, String element, int index) { + boolean multi = true; + if(multiDelimiter.length > 0) { + multi = matchMultiDelimiter(element, index + 1); + } + return ch == multiDelimiter[0] && multi; + } + private void appendQuoted(int start, boolean allowTrim, String element) { final int length = element.length(); int i = start; From 975b5378ecb068f821124f93fafa247a2622fff0 Mon Sep 17 00:00:00 2001 From: Graham Stewart Date: Wed, 16 Mar 2022 20:39:58 -0700 Subject: [PATCH 2/4] multiDelimiter.length > 0 --> .length > 1 --- src/main/java/com/univocity/parsers/csv/CsvWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/univocity/parsers/csv/CsvWriter.java b/src/main/java/com/univocity/parsers/csv/CsvWriter.java index ff87c3f9..0193841d 100644 --- a/src/main/java/com/univocity/parsers/csv/CsvWriter.java +++ b/src/main/java/com/univocity/parsers/csv/CsvWriter.java @@ -357,7 +357,7 @@ private boolean append(int columnIndex, boolean isElementQuoted, boolean allowTr private boolean delimiterChecker(char ch, String element, int index) { boolean multi = true; - if(multiDelimiter.length > 0) { + if(multiDelimiter.length > 1) { multi = matchMultiDelimiter(element, index + 1); } return ch == multiDelimiter[0] && multi; From 5d906aa1a1ec7ae530d85301d3628c2a188646a4 Mon Sep 17 00:00:00 2001 From: MustafaWasif Date: Fri, 18 Mar 2022 16:29:38 -0700 Subject: [PATCH 3/4] Refactor using Extract Methods --- .../parsers/fixed/FixedWidthParser.java | 127 +++++++++++------- 1 file changed, 77 insertions(+), 50 deletions(-) diff --git a/src/main/java/com/univocity/parsers/fixed/FixedWidthParser.java b/src/main/java/com/univocity/parsers/fixed/FixedWidthParser.java index b38bc991..291493f4 100644 --- a/src/main/java/com/univocity/parsers/fixed/FixedWidthParser.java +++ b/src/main/java/com/univocity/parsers/fixed/FixedWidthParser.java @@ -149,64 +149,92 @@ protected void parseRecord() { lookaheadInput.lookahead(maxLookupLength); if (lookaheadFormats != null) { - for (int i = 0; i < lookaheadFormats.length; i++) { - if (lookaheadInput.matches(ch, lookaheadFormats[i].value, wildcard)) { - lengths = lookaheadFormats[i].lengths; - alignments = lookaheadFormats[i].alignments; - paddings = lookaheadFormats[i].paddings; - ignore = lookaheadFormats[i].ignore; - keepPaddingFlags = lookaheadFormats[i].keepPaddingFlags; - lookupFormat = lookaheadFormats[i]; - matched = true; - break; - } - } + lookAheadFormatsNotNull(lookaheadInput, matched); + matched = true; + if (lookbehindFormats != null && matched) { - lookbehindFormat = null; - for (int i = 0; i < lookbehindFormats.length; i++) { - if (lookaheadInput.matches(ch, lookbehindFormats[i].value, wildcard)) { - lookbehindFormat = lookbehindFormats[i]; - break; - } - } + lookBehindFormatsNotNull(lookaheadInput); } } else { - for (int i = 0; i < lookbehindFormats.length; i++) { - if (lookaheadInput.matches(ch, lookbehindFormats[i].value, wildcard)) { - lookbehindFormat = lookbehindFormats[i]; - matched = true; - lengths = rootLengths; - ignore = rootIgnore; - keepPaddingFlags = rootKeepPaddingFlags; - break; - } - } + lookAheadFormatsNull(lookaheadInput, matched); } if (!matched) { - if (lookbehindFormat == null) { - if (rootLengths == null) { - throw new TextParsingException(context, "Cannot process input with the given configuration. No default field lengths defined and no lookahead/lookbehind value match '" + lookaheadInput.getLookahead(ch) + '\''); - } - lengths = rootLengths; - alignments = rootAlignments; - paddings = rootPaddings; - ignore = rootIgnore; - keepPaddingFlags = rootKeepPaddingFlags; - lookupFormat = null; - } else { - lengths = lookbehindFormat.lengths; - alignments = lookbehindFormat.alignments; - paddings = lookbehindFormat.paddings; - ignore = lookbehindFormat.ignore; - keepPaddingFlags = lookbehindFormat.keepPaddingFlags; - lookupFormat = lookbehindFormat; - } + unmatched(); + } + } + + process(); + } + + //New Method (Extract Method) + private void unmatched(){ + if (lookbehindFormat == null) { + if (rootLengths == null) { + throw new TextParsingException(context, "Cannot process input with the given configuration. No default field lengths defined and no lookahead/lookbehind value match '" + lookaheadInput.getLookahead(ch) + '\''); + } + lengths = rootLengths; + alignments = rootAlignments; + paddings = rootPaddings; + ignore = rootIgnore; + keepPaddingFlags = rootKeepPaddingFlags; + lookupFormat = null; + } else { + lengths = lookbehindFormat.lengths; + alignments = lookbehindFormat.alignments; + paddings = lookbehindFormat.paddings; + ignore = lookbehindFormat.ignore; + keepPaddingFlags = lookbehindFormat.keepPaddingFlags; + lookupFormat = lookbehindFormat; + } + } + + //Extract Method + private void lookAheadFormatsNotNull(LookaheadCharInputReader lookaheadInput, boolean matched){ + //lookaheadInput = input; + for (int i = 0; i < lookaheadFormats.length; i++) { + if (lookaheadInput.matches(ch, lookaheadFormats[i].value, wildcard)) { + lengths = lookaheadFormats[i].lengths; + alignments = lookaheadFormats[i].alignments; + paddings = lookaheadFormats[i].paddings; + ignore = lookaheadFormats[i].ignore; + keepPaddingFlags = lookaheadFormats[i].keepPaddingFlags; + lookupFormat = lookaheadFormats[i]; + matched = true; + break; + } + } + } + + //Extract Method + private void lookBehindFormatsNotNull(LookaheadCharInputReader lookaheadInput){ + //lookaheadInput = input; + lookbehindFormat = null; + for (int i = 0; i < lookbehindFormats.length; i++) { + if (lookaheadInput.matches(ch, lookbehindFormats[i].value, wildcard)) { + lookbehindFormat = lookbehindFormats[i]; + break; } } + } + + //Extract Method + private void lookAheadFormatsNull(LookaheadCharInputReader lookaheadInput, boolean matched){ + for (int i = 0; i < lookbehindFormats.length; i++) { + if (lookaheadInput.matches(ch, lookbehindFormats[i].value, wildcard)) { + lookbehindFormat = lookbehindFormats[i]; + matched = true; + lengths = rootLengths; + ignore = rootIgnore; + keepPaddingFlags = rootKeepPaddingFlags; + break; + } + } + } - int i; - for (i = 0; i < lengths.length; i++) { + //Extract Method + private void process(){ + for (int i = 0; i < lengths.length; i++) { final boolean ignorePadding = keepPaddingFlags[i] == null ? !keepPadding : !keepPaddingFlags[i]; length = lengths[i]; if (paddings != null) { @@ -249,7 +277,6 @@ protected void parseRecord() { skipToNewLine(); } useDefaultPadding = false; - } private void skipToNewLine() { From 4697ccac91027c4b6cfd6bf83b314833561575aa Mon Sep 17 00:00:00 2001 From: MustafaWasif Date: Mon, 18 Apr 2022 21:56:38 -0700 Subject: [PATCH 4/4] Test for multiple rows with no values --- .../univocity/parsers/csv/CsvParserTest.java | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/test/java/com/univocity/parsers/csv/CsvParserTest.java b/src/test/java/com/univocity/parsers/csv/CsvParserTest.java index 70beadd7..e06bb625 100755 --- a/src/test/java/com/univocity/parsers/csv/CsvParserTest.java +++ b/src/test/java/com/univocity/parsers/csv/CsvParserTest.java @@ -29,6 +29,24 @@ public class CsvParserTest extends ParserTestCase { + + @Test + public void FilterOutRowsWithNoValues() { + String test = "v11, v12, v13\n" + ",,,\n" + "v31, v32, v33\n" + "v41, v42, v43"; //contains multiple rows with no values + CsvParserSettings csvSettings = new CsvParserSettings(); + csvSettings.setSkipEmptyLines(true); + csvSettings.setSkipEmptyRecords(false); + csvSettings.setHeaderExtractionEnabled(true); + CsvParser parser = new CsvParser(csvSettings); + + List result = parser.parseAllRecords(new ByteArrayInputStream(test.getBytes())); + assertEquals(result.size(), 4); + + csvSettings.setSkipEmptyRecords(true); + result = parser.parseAllRecords(new ByteArrayInputStream(test.getBytes()) ); + assertEquals(result.size(), 3); + } + @DataProvider(name = "testProvider") public Object[][] testProvider() { return new Object[][]{ @@ -362,7 +380,9 @@ public void testReadEmptyValue() { CsvParser parser = new CsvParser(settings); parser.beginParsing(new StringReader("a,b,,c,\"\",\r\n")); + //parser.parse(new StringReader("a,b,,c,\"\",\r\n")); String[] row = parser.parseNext(); + List rows = processor.getRows(); assertEquals(row[0], "a"); assertEquals(row[1], "b"); @@ -370,6 +390,8 @@ public void testReadEmptyValue() { assertEquals(row[3], "c"); assertEquals(row[4], ""); assertEquals(row[5], null); + + } @DataProvider