From c7c06ab9d2ebbc75526076f02c3caa5f921af676 Mon Sep 17 00:00:00 2001 From: Kyle Moore Date: Thu, 30 Oct 2014 23:02:46 +0900 Subject: [PATCH 1/5] Adding test case for variable-width Japanese characters --- .../formatter/PrettyFormatterTest.java | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/java/src/test/java/gherkin/formatter/PrettyFormatterTest.java b/java/src/test/java/gherkin/formatter/PrettyFormatterTest.java index 4b2aff01..4a31b88b 100644 --- a/java/src/test/java/gherkin/formatter/PrettyFormatterTest.java +++ b/java/src/test/java/gherkin/formatter/PrettyFormatterTest.java @@ -99,6 +99,63 @@ public void shouldFormatAsDesigned() throws IOException { } + /** + * CJK and other character sets often contain 'fullwidth' characters. + *

+ * Fullwidth characters are only counted as a single character but occupy twice + * the space of a typical ASCII character when using a fixed-width font. + *

+ * For example, in Japanese this text is 2 characters, padded with 5 spaces to match the longer column below: + *

+ * + *

+ * The net result is that '|' and '#' characters will be misaligned when + * printing {@code --i18n ja} or a feature containing a mix of normal and fullwidth characters. + * + * The only way to resolve this is to keep a running tally of half- and full-width characters, + * then padding the output with a mix of trailing half- (u+0020) and full-width (u+3000) spaces. + *

+ * Repeating the above example, we now have: + *

+ * + * @see "http://en.wikipedia.org/wiki/Halfwidth_and_fullwidth_forms" + * + * @throws IOException + */ + @Test + public void shouldFormatAsDesignedWithFullWidthCharacters() throws IOException { + + StringBuilder featureBuilder = new StringBuilder(); + featureBuilder.append("# language: ja\n"); + featureBuilder.append("機能: PrettyFormatter with Japanese\n"); + featureBuilder.append("シナリオ: Formmat beautifully\n"); + featureBuilder.append("もしI have this table:\n"); + featureBuilder.append("\t|名前|?の値1|\n"); + featureBuilder.append("\t|ab12ab12|ハンカク|\n"); + featureBuilder.append("ならばshould formatt beautifully.\n"); + String feature = featureBuilder.toString(); + + List lines = doFormatter(feature); + + assertEquals("Formatter produces unexpected quantity of lines. ", 8, lines.size()); + + assertEquals("# language: ja", lines.get(0)); + assertEquals("機能: PrettyFormatter with Japanese", lines.get(1)); + assertEquals("", lines.get(2)); + assertEquals(" シナリオ: Formmat beautifully", lines.get(3)); + assertEquals(" もしI have this table:", lines.get(4)); + assertEquals(" | 名前   | ?の値1 |", lines.get(5)); + assertEquals(" | ab12ab12 | ハンカク   |", lines.get(6)); + assertEquals(" ならばshould formatt beautifully.", lines.get(7)); + + } + @Test public void shouldAppendOnlyCompleteLinesAndFlushBetween() throws IOException { From 122875864409fb12ccbfc1ee860080a865c7d4e7 Mon Sep 17 00:00:00 2001 From: Kyle Moore Date: Fri, 31 Oct 2014 01:39:29 +0900 Subject: [PATCH 2/5] Fixed pretty formatting for Chinese/Japanese text * Passes PrettyFormatterTest#shouldFormatAsDesignedWithFullWidthCharacters * Swapped gherkin jar into cucumber-jvm; tested with cucumber -i118n * Results look good for zh-CH and ja; ko was still a little bit off * Not crazy about using multi-dimensional arrays. Probably need to think of a better way. --- .../gherkin/formatter/PrettyFormatter.java | 92 +++++++++++++++++-- 1 file changed, 83 insertions(+), 9 deletions(-) diff --git a/java/src/main/java/gherkin/formatter/PrettyFormatter.java b/java/src/main/java/gherkin/formatter/PrettyFormatter.java index 060e519d..4ec2fed1 100644 --- a/java/src/main/java/gherkin/formatter/PrettyFormatter.java +++ b/java/src/main/java/gherkin/formatter/PrettyFormatter.java @@ -19,6 +19,7 @@ import gherkin.util.Mapper; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.List; @@ -27,6 +28,7 @@ import static gherkin.util.FixJava.join; import static gherkin.util.FixJava.map; +import static java.lang.Character.UnicodeBlock; /** * This class pretty prints feature files like they were in the source, only @@ -48,8 +50,8 @@ public String map(Tag tag) { }; private Formats formats; private Match match; - private int[][] cellLengths; - private int[] maxLengths; + private int[][][] cellLengths; + private int[][] maxLengths; private int rowIndex; private List rows; private Integer rowHeight = null; @@ -296,16 +298,28 @@ private void prepareTable(List rows) { } } - cellLengths = new int[rows.size()][columnCount]; - maxLengths = new int[columnCount]; + cellLengths = new int[rows.size()][columnCount][2]; + maxLengths = new int[columnCount][2]; for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { Row row = rows.get(rowIndex); final List cells = row.getCells(); for (int colIndex = 0; colIndex < columnCount; colIndex++) { final String cell = getCellSafely(cells, colIndex); - final int length = escapeCell(cell).length(); - cellLengths[rowIndex][colIndex] = length; - maxLengths[colIndex] = Math.max(maxLengths[colIndex], length); + final char[] chars = escapeCell(cell).toCharArray(); + int numNormalChars = 0; + int numFullWidthChars = 0; + for(char ch : chars) { + if(isFullWidthChar(ch)) { + numFullWidthChars++; + } else { + numNormalChars++; + } + + } + cellLengths[rowIndex][colIndex][0] = numNormalChars; + cellLengths[rowIndex][colIndex][1] = numFullWidthChars; + maxLengths[colIndex][0] = Math.max(maxLengths[colIndex][0], numNormalChars); + maxLengths[colIndex][1] = Math.max(maxLengths[colIndex][1], numFullWidthChars); } } rowIndex = 0; @@ -315,6 +329,57 @@ private String getCellSafely(final List cells, final int colIndex) { return (colIndex < cells.size()) ? cells.get(colIndex) : ""; } + private static final List LATIN = Arrays.asList( + UnicodeBlock.BASIC_LATIN, + UnicodeBlock.LATIN_1_SUPPLEMENT, + UnicodeBlock.LATIN_EXTENDED_A, + UnicodeBlock.LATIN_EXTENDED_B, + UnicodeBlock.LATIN_EXTENDED_ADDITIONAL + ); + + private static final List CJK = Arrays.asList( + UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS, + UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, + UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, + UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION, + UnicodeBlock.CJK_RADICALS_SUPPLEMENT, + UnicodeBlock.CJK_COMPATIBILITY, + UnicodeBlock.CJK_COMPATIBILITY_FORMS, + UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS, + UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, + UnicodeBlock.HANGUL_SYLLABLES, + UnicodeBlock.HANGUL_JAMO, + UnicodeBlock.HANGUL_COMPATIBILITY_JAMO, + UnicodeBlock.KATAKANA, + UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS, + UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS, + UnicodeBlock.HIRAGANA + ); + + /** + * The range U+FF61~U+FFDC is a special case; do not count these as full-width + * + * @param c a character + * @return True if half-width katakana (アカサタ等), false otherwise + */ + private boolean isHalfWidthKatakana(char c) { + return '\uFF61' <= c && + '\uFFDC' >= c; + } + + private boolean isFullWidthChar(char c) { + final UnicodeBlock block = UnicodeBlock.of(c); + + if(LATIN.contains(block)) { + return false; + } + if(CJK.contains(block) && !isHalfWidthKatakana(c)) { + return true; + } + + return false; + } + public void row(List cellResults) { StringBuilder buffer = new StringBuilder(); Row row = rows.get(rowIndex); @@ -358,7 +423,11 @@ public void row(List cellResults) { } Format format = formats.get(status); buffer.append(format.text(cellText)); - int padding = maxLengths[colIndex] - cellLengths[rowIndex][colIndex]; + int padding = Math.max(maxLengths[colIndex][0] - cellLengths[rowIndex][colIndex][0], 0); + int fullWidthPadding = Math.max(maxLengths[colIndex][1] - cellLengths[rowIndex][colIndex][1], 0); + // rpad with full-width spaces first, then normal spaces. + // the order is not significant but this way prevents padding of single spaces, followed by full-width, then followed with the space and pipe + padSpace(buffer, fullWidthPadding, true); padSpace(buffer, padding); if (colIndex < maxLengths.length - 1) { buffer.append(" | "); @@ -440,8 +509,13 @@ private void calculateLocationIndentations() { } private void padSpace(StringBuilder buffer, int indent) { + padSpace(buffer, indent, false); + } + + private void padSpace(StringBuilder buffer, int indent, boolean useFullWidth) { + char whitespace = useFullWidth ? '\u3000' : ' '; for (int i = 0; i < indent; i++) { - buffer.append(" "); + buffer.append(whitespace); } } From 017c5146c5017449fa3819d7f480d11a165f23b4 Mon Sep 17 00:00:00 2001 From: Kyle Moore Date: Fri, 31 Oct 2014 01:48:15 +0900 Subject: [PATCH 3/5] Remove unnecessary rounding --- java/src/main/java/gherkin/formatter/PrettyFormatter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/gherkin/formatter/PrettyFormatter.java b/java/src/main/java/gherkin/formatter/PrettyFormatter.java index 4ec2fed1..b85958d3 100644 --- a/java/src/main/java/gherkin/formatter/PrettyFormatter.java +++ b/java/src/main/java/gherkin/formatter/PrettyFormatter.java @@ -423,8 +423,8 @@ public void row(List cellResults) { } Format format = formats.get(status); buffer.append(format.text(cellText)); - int padding = Math.max(maxLengths[colIndex][0] - cellLengths[rowIndex][colIndex][0], 0); - int fullWidthPadding = Math.max(maxLengths[colIndex][1] - cellLengths[rowIndex][colIndex][1], 0); + int padding = maxLengths[colIndex][0] - cellLengths[rowIndex][colIndex][0]; + int fullWidthPadding = maxLengths[colIndex][1] - cellLengths[rowIndex][colIndex][1]; // rpad with full-width spaces first, then normal spaces. // the order is not significant but this way prevents padding of single spaces, followed by full-width, then followed with the space and pipe padSpace(buffer, fullWidthPadding, true); From b11feb333d631c78da8882f5bc955239b012553d Mon Sep 17 00:00:00 2001 From: Kyle Moore Date: Sat, 1 Nov 2014 14:36:22 +0900 Subject: [PATCH 4/5] Use descriptive getters/setters instead of incomprehensible three-dimensional array references --- .../gherkin/formatter/PrettyFormatter.java | 47 ++++++++++++++++--- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/java/src/main/java/gherkin/formatter/PrettyFormatter.java b/java/src/main/java/gherkin/formatter/PrettyFormatter.java index b85958d3..6d2d5814 100644 --- a/java/src/main/java/gherkin/formatter/PrettyFormatter.java +++ b/java/src/main/java/gherkin/formatter/PrettyFormatter.java @@ -316,10 +316,10 @@ private void prepareTable(List rows) { } } - cellLengths[rowIndex][colIndex][0] = numNormalChars; - cellLengths[rowIndex][colIndex][1] = numFullWidthChars; - maxLengths[colIndex][0] = Math.max(maxLengths[colIndex][0], numNormalChars); - maxLengths[colIndex][1] = Math.max(maxLengths[colIndex][1], numFullWidthChars); + setNumberOfNormalWidthCharsInCell(cellLengths[rowIndex][colIndex], numNormalChars); + setNumberOfFullWidthCharsInCell(cellLengths[rowIndex][colIndex], numFullWidthChars); + updateMaxLengthOfNormalWidthCharsForColumn(maxLengths[colIndex], numNormalChars); + updateMaxLengthOfFullWidthCharsForColumn(maxLengths[colIndex], numFullWidthChars); } } rowIndex = 0; @@ -329,6 +329,38 @@ private String getCellSafely(final List cells, final int colIndex) { return (colIndex < cells.size()) ? cells.get(colIndex) : ""; } + private int getNumberOfNormalWidthCharsInCell(int[] cellLength) { + return cellLength[0]; + } + + private void setNumberOfNormalWidthCharsInCell(int[] cellLength, int numNormalChars) { + cellLength[0] = numNormalChars; + } + + private int getNumberOfFullWidthCharsInCell(int[] cellLength) { + return cellLength[1]; + } + + private void setNumberOfFullWidthCharsInCell(int[] cellLength, int numFullWidthChars) { + cellLength[1] = numFullWidthChars; + } + + private int getMaxLengthOfNormalWidthCharsForColumn(int[] maxLength) { + return maxLength[0]; + } + + private void updateMaxLengthOfNormalWidthCharsForColumn(int[] maxLength, int numNormalChars) { + maxLength[0] = Math.max(maxLength[0], numNormalChars); + } + + private int getMaxLengthOfFullWidthCharsForColumn(int[] maxLength) { + return maxLength[1]; + } + + private void updateMaxLengthOfFullWidthCharsForColumn(int[] maxLength, int numFullWidthChars) { + maxLength[1] = Math.max(maxLength[1], numFullWidthChars); + } + private static final List LATIN = Arrays.asList( UnicodeBlock.BASIC_LATIN, UnicodeBlock.LATIN_1_SUPPLEMENT, @@ -423,10 +455,11 @@ public void row(List cellResults) { } Format format = formats.get(status); buffer.append(format.text(cellText)); - int padding = maxLengths[colIndex][0] - cellLengths[rowIndex][colIndex][0]; - int fullWidthPadding = maxLengths[colIndex][1] - cellLengths[rowIndex][colIndex][1]; + int padding = getMaxLengthOfNormalWidthCharsForColumn(maxLengths[colIndex]) - getNumberOfNormalWidthCharsInCell(cellLengths[rowIndex][colIndex]); + int fullWidthPadding = getMaxLengthOfFullWidthCharsForColumn(maxLengths[colIndex]) - getNumberOfFullWidthCharsInCell(cellLengths[rowIndex][colIndex]); // rpad with full-width spaces first, then normal spaces. - // the order is not significant but this way prevents padding of single spaces, followed by full-width, then followed with the space and pipe + // the order is not significant but this way prevents inconsistend padding + // such as: single spaces, followed by full-width spaces, then followed with a final single space and pipe delimiter padSpace(buffer, fullWidthPadding, true); padSpace(buffer, padding); if (colIndex < maxLengths.length - 1) { From 7ce982077b9d5fb0efdca3004ada8a9450f25152 Mon Sep 17 00:00:00 2001 From: Kyle Moore Date: Sat, 1 Nov 2014 14:36:54 +0900 Subject: [PATCH 5/5] Simply full-width detection logic --- .../gherkin/formatter/PrettyFormatter.java | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/java/src/main/java/gherkin/formatter/PrettyFormatter.java b/java/src/main/java/gherkin/formatter/PrettyFormatter.java index 6d2d5814..57e8c9c6 100644 --- a/java/src/main/java/gherkin/formatter/PrettyFormatter.java +++ b/java/src/main/java/gherkin/formatter/PrettyFormatter.java @@ -399,17 +399,17 @@ private boolean isHalfWidthKatakana(char c) { '\uFFDC' >= c; } + /** + * The majority of characters passed in will be in the LATIN collection. + * Therefore we check there first, short-circuit and return as soon as possible. + * + * @param c The char to evaluate + * @return True if it is a Full-Width character, false otherwise + */ private boolean isFullWidthChar(char c) { final UnicodeBlock block = UnicodeBlock.of(c); - - if(LATIN.contains(block)) { - return false; - } - if(CJK.contains(block) && !isHalfWidthKatakana(c)) { - return true; - } - - return false; + return(!LATIN.contains(block) && + (CJK.contains(block) && !isHalfWidthKatakana(c))); } public void row(List cellResults) {