From e2e8c05751136231126086d13858e80661a0629f Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Thu, 10 Oct 2024 15:00:29 +0200 Subject: [PATCH] improved DoubleParser and DecimalFormatBridge such that characters spanning multiple bytes can now also be converted to other locales before being parsed. Added tests --- .../jetbrains/kotlinx/dataframe/api/parse.kt | 5 +- .../kotlinx/dataframe/impl/io/DoubleParser.kt | 218 +++++++++++----- .../kotlinx/dataframe/io/DoubleParserTests.kt | 234 ++++++++++++++++++ 3 files changed, 391 insertions(+), 66 deletions(-) create mode 100644 core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/DoubleParserTests.kt diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index 7256abe3ec..b7b2c74377 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -44,8 +44,9 @@ public interface GlobalParserOptions { * ### Options for parsing [String]`?` columns * * @param locale locale to use for parsing dates and numbers, defaults to the System default locale. - * If specified instead of [dateTimeFormatter], it will be used (in combination with the optional [dateTimePattern]) - * to create a [DateTimeFormatter]. + * If specified instead of [dateTimeFormatter], it will be used in combination with [dateTimePattern] + * to create a [DateTimeFormatter]. Just providing [locale] will not allow you to parse + * locale-specific dates! * @param dateTimeFormatter a [DateTimeFormatter] to use for parsing dates, if not specified, it will be created * from [dateTimePattern] and [locale]. If neither [dateTimeFormatter] nor [dateTimePattern] are specified, * [DateTimeFormatter.ISO_LOCAL_DATE_TIME] will be used. diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/DoubleParser.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/DoubleParser.kt index 7e8aedc542..64226b181f 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/DoubleParser.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/DoubleParser.kt @@ -15,17 +15,19 @@ import java.util.Locale * [JavaDoubleParser]. This parser relies on [DecimalFormatBridge] to convert the input to a format that can be * parsed by [JavaDoubleParser] and thus may be unreliable. * - * Public so it can be used in other modules. + * Public, so it can be used in other modules. + * Open, so it may be modified with your own (better) [DecimalFormatBridge]. */ -public class DoubleParser(private val parserOptions: ParserOptions) { +public open class DoubleParser(private val parserOptions: ParserOptions) { - private val locale = parserOptions.locale ?: Locale.getDefault() + protected val locale: Locale = parserOptions.locale ?: Locale.getDefault() - private val bridge - get() = DecimalFormatBridge(from = locale, to = Locale.ROOT) + protected val supportedFastCharsets: Set = setOf(Charsets.UTF_8, Charsets.ISO_8859_1, Charsets.US_ASCII) + protected open val bridge: DecimalFormatBridge + get() = DecimalFormatBridgeImpl(from = locale, to = Locale.ROOT) // fallback method for parsing doubles - private fun String.parseToDoubleOrNull(): Double? = + protected fun String.parseToDoubleOrNull(): Double? = when (lowercase()) { "inf", "+inf", "infinity", "+infinity" -> Double.POSITIVE_INFINITY @@ -46,16 +48,17 @@ public class DoubleParser(private val parserOptions: ParserOptions) { result } } + } ?: let { + println("could not parse $this as double, returning null.") + null } - private val supportedFastCharsets = setOf(Charsets.UTF_8, Charsets.ISO_8859_1, Charsets.US_ASCII) - // parse a byte array of encoded strings into a double - public fun parseOrNull(ba: ByteArray, charset: Charset = Charsets.UTF_8): Double? { + public open fun parseOrNull(ba: ByteArray, charset: Charset = Charsets.UTF_8): Double? { if (parserOptions.useFastDoubleParser && charset in supportedFastCharsets) { try { // first try to parse with JavaDoubleParser by converting the locale to ROOT - val array = bridge.convert(ba) + val array = bridge.convert(ba, charset) return JavaDoubleParser.parseDouble(array) } catch (_: Exception) { @@ -64,7 +67,7 @@ public class DoubleParser(private val parserOptions: ParserOptions) { return ba.toString(charset).parseToDoubleOrNull() } - public fun parseOrNull(cs: CharSequence): Double? { + public open fun parseOrNull(cs: CharSequence): Double? { if (parserOptions.useFastDoubleParser) { try { // first try to parse with JavaDoubleParser by converting the locale to ROOT @@ -78,7 +81,7 @@ public class DoubleParser(private val parserOptions: ParserOptions) { return cs.toString().parseToDoubleOrNull() } - public fun parseOrNull(ca: CharArray): Double? { + public open fun parseOrNull(ca: CharArray): Double? { if (parserOptions.useFastDoubleParser) { try { // first try to parse with JavaDoubleParser by converting the locale to ROOT @@ -108,70 +111,157 @@ public class DoubleParser(private val parserOptions: ParserOptions) { * * May be removed if [Issue #82](https://github.com/wrandelshofer/FastDoubleParser/issues/82) is resolved. */ -internal class DecimalFormatBridge private constructor(from: DecimalFormatSymbols, to: DecimalFormatSymbols) { +public interface DecimalFormatBridge { + + public fun convert(string: String): String + + public fun convert(bytes: ByteArray, charset: Charset): ByteArray + + public fun convert(chars: CharArray): CharArray +} + +/** + * Implementation of [DecimalFormatBridge] which replaces individual characters in [DecimalFormatSymbols] + * of the source [Locale] into those of the destination [Locale]. + * + * This catches most cases, but definitely not all. + * + * NOTE: Strings are not replaced. + */ +internal class DecimalFormatBridgeImpl private constructor(from: DecimalFormatSymbols, to: DecimalFormatSymbols) : + DecimalFormatBridge { - companion object { + companion object { - private val memoizedBridges: MutableMap, DecimalFormatBridge> = mutableMapOf() + private val memoizedBridges: MutableMap>, DecimalFormatBridge> = + mutableMapOf() + + operator fun invoke( + from: Locale, + to: Locale, + supportedCharsets: Set = setOf( + Charsets.UTF_8, + Charsets.ISO_8859_1, + Charsets.US_ASCII, + ), + ): DecimalFormatBridge = + memoizedBridges.getOrPut(Triple(from, to, supportedCharsets)) { + DecimalFormatBridgeImpl( + from = DecimalFormatSymbols.getInstance(from), + to = DecimalFormatSymbols.getInstance(to), + ) + } + } + + // whether the conversion is a no-op + private val noop = from == to + + private val charConversions: Map = listOf( + from.zeroDigit to to.zeroDigit, + from.groupingSeparator to to.groupingSeparator, + from.decimalSeparator to to.decimalSeparator, + from.perMill to to.perMill, + from.percent to to.percent, + from.digit to to.digit, + from.patternSeparator to to.patternSeparator, + from.minusSign to to.minusSign, + from.monetaryDecimalSeparator to to.monetaryDecimalSeparator, + ).filter { it.first != it.second } + .toMap() + + private fun getBytesConversions(charset: Charset): List> = + charConversions + .map { (from, to) -> "$from".toByteArray(charset) to "$to".toByteArray(charset) } + .sortedByDescending { it.first.size } + .distinctBy { it.first.toString(charset) } + + /** + * Lazy map (to be filled with [getBytesConversions]) per charset of [charConversions] + * that contains the char-as-byteArray version of [charConversions]. + * Useful if you want to convert a [String] as [ByteArray] to another [Locale]. + */ + private val bytesConversions: MutableMap>> = mutableMapOf() + + override fun convert(string: String): String { + if (noop || charConversions.isEmpty()) return string + return string.map { charConversions[it] ?: it }.joinToString("") + } - operator fun invoke(from: Locale, to: Locale): DecimalFormatBridge = - memoizedBridges.getOrPut(from to to) { - DecimalFormatBridge( - DecimalFormatSymbols.getInstance(from), - DecimalFormatSymbols.getInstance(to), - ) + override fun convert(bytes: ByteArray, charset: Charset): ByteArray { + val conversionMap = bytesConversions + .getOrPut(charset) { getBytesConversions(charset) } + + // gets all replacements as (index, original byte count, replacement) triples + val replacements = conversionMap.flatMap { (from, to) -> + bytes.findAllSubsequenceIndices(from).map { Triple(it, from.size, to) } } - } - // whether the conversion is a no-op - private val noop = from == to - - private val conversions: Map = listOf( - from.zeroDigit to to.zeroDigit, - from.groupingSeparator to to.groupingSeparator, - from.decimalSeparator to to.decimalSeparator, - from.perMill to to.perMill, - from.percent to to.percent, - from.digit to to.digit, - from.patternSeparator to to.patternSeparator, - from.minusSign to to.minusSign, - from.monetaryDecimalSeparator to to.monetaryDecimalSeparator, - ).filter { it.first != it.second } - .toMap() - - private val byteConversions = conversions - .map { (from, to) -> - from.code.toByte() to to.code.toByte() - }.toMap() - - fun convert(input: String): String { - if (noop || conversions.isEmpty()) return input - return input.map { conversions[it] ?: it }.joinToString("") - } + // change bytes into a list of byte arrays, where each byte array is a single byte + val adjustableBytes = bytes.map { byteArrayOf(it) }.toMutableList() - fun convert(bytes: ByteArray): ByteArray { - val copy = bytes.copyOf() - convertInPlace(copy) - return copy - } + // perform each replacement in adjustableBytes + // if the new replacement is shorter than the original, remove bytes after the replacement + // if the new replacement is longer than the original, well that's why we have a byte array per original byte + for ((i, originalByteCount, replacement) in replacements) { + adjustableBytes[i] = replacement + for (j in (i + 1)..<(i + originalByteCount)) { + adjustableBytes[j] = byteArrayOf() + } + } + + // flatten the list of byte arrays into a single byte array + val result = ByteArray(adjustableBytes.sumOf { it.size }) + var i = 0 + for (it in adjustableBytes) { + if (it.isEmpty()) continue + System.arraycopy(it, 0, result, i, it.size) + i += it.size + } + + return result + } - fun convertInPlace(bytes: ByteArray) { - if (noop || conversions.isEmpty()) return - for (i in bytes.indices) { - bytes[i] = byteConversions[bytes[i]] ?: bytes[i] + override fun convert(chars: CharArray): CharArray { + val copy = chars.copyOf() + convertInPlace(copy) + return copy + } + + fun convertInPlace(chars: CharArray) { + if (noop || charConversions.isEmpty()) return + for (i in chars.indices) { + chars[i] = charConversions[chars[i]] ?: chars[i] + } } } - fun convert(chars: CharArray): CharArray { - val copy = chars.copyOf() - convertInPlace(copy) - return copy +// Helper function to find all subsequences in a byte array +internal fun ByteArray.findAllSubsequenceIndices(subBytes: ByteArray): List { + val result = mutableListOf() + if (subBytes.isEmpty()) { + return result } - fun convertInPlace(chars: CharArray) { - if (noop || conversions.isEmpty()) return - for (i in chars.indices) { - chars[i] = conversions[chars[i]] ?: chars[i] + val lenMain = this.size + val lenSub = subBytes.size + + var i = 0 + while (i <= lenMain - lenSub) { + var match = true + for (j in subBytes.indices) { + if (this[i + j] != subBytes[j]) { + match = false + break + } + } + + if (match) { + result.add(i) + i += lenSub // Move i forward by the length of the subsequence to avoid overlapping matches + } else { + i++ } } + + return result } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/DoubleParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/DoubleParserTests.kt new file mode 100644 index 0000000000..1731be182f --- /dev/null +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/DoubleParserTests.kt @@ -0,0 +1,234 @@ +package org.jetbrains.kotlinx.dataframe.io + +import io.kotest.matchers.collections.shouldContainInOrder +import org.jetbrains.kotlinx.dataframe.api.ParserOptions +import org.jetbrains.kotlinx.dataframe.impl.io.DecimalFormatBridgeImpl +import org.jetbrains.kotlinx.dataframe.impl.io.DoubleParser +import org.junit.Test +import java.util.Locale + +class DoubleParserTests { + + @Test + fun `can bridge from German locale`() { + val numbers = listOf( + "12,45", + "-13,35", + "100.123,35", + "-204.235,23", + "1,234e3", + ) + + val expectedStrings = listOf( + "12.45", + "-13.35", + "100,123.35", + "-204,235.23", + "1.234e3", + ) + // test bridge + val bridge = DecimalFormatBridgeImpl(Locale.GERMANY, Locale.ROOT) + + // CharSequence + numbers.map { bridge.convert(it) }.shouldContainInOrder(expectedStrings) + + // CharArray + numbers.map { bridge.convert(it.toCharArray()).joinToString("") }.shouldContainInOrder(expectedStrings) + + // ByteArray + for (charset in listOf(Charsets.UTF_8, Charsets.ISO_8859_1, Charsets.US_ASCII)) { + numbers.map { bridge.convert(it.toByteArray(charset), charset).toString(charset) } + .shouldContainInOrder(expectedStrings) + } + } + + @Test + fun `can bridge from French locale`() { + val numbers = listOf( + "12,45", + "-13,35", + "100 123,35", + "-204 235,23", + "1,234e3", + ) + + val expectedStrings = listOf( + "12.45", + "-13.35", + "100,123.35", + "-204,235.23", + "1.234e3", + ) + // test bridge + val bridge = DecimalFormatBridgeImpl(Locale.FRANCE, Locale.ROOT) + + // CharSequence + numbers.map { bridge.convert(it) }.shouldContainInOrder(expectedStrings) + + // CharArray + numbers.map { bridge.convert(it.toCharArray()).joinToString("") }.shouldContainInOrder(expectedStrings) + + // ByteArray + for (charset in listOf(Charsets.UTF_8, Charsets.ISO_8859_1, Charsets.US_ASCII)) { + numbers.map { bridge.convert(it.toByteArray(charset), charset).toString(charset) } + .shouldContainInOrder(expectedStrings) + } + } + + @Test + fun `can bridge from Estonian locale`() { + val numbers = listOf( + "12,45", + "−13,35", // note the different minus sign '−' vs '-' + "100 123,35", + "−204 235,23", // note the different minus sign '−' vs '-' + "1,234e3", + "-345,122", // check forgiving behavior with 'ordinary' minus sign + ) + + val expectedStrings = listOf( + "12.45", + "-13.35", + "100,123.35", + "-204,235.23", + "1.234e3", + "-345.122", + ) + // test bridge + val bridge = DecimalFormatBridgeImpl(Locale.forLanguageTag("et-EE"), Locale.ROOT) + + // CharSequence + numbers.map { bridge.convert(it) }.shouldContainInOrder(expectedStrings) + + // CharArray + numbers.map { bridge.convert(it.toCharArray()).joinToString("") }.shouldContainInOrder(expectedStrings) + + // ByteArray (must skip ASCII as NBSP and estonian minus sign are not ASCII) + for (charset in listOf(Charsets.UTF_8, Charsets.ISO_8859_1)) { + numbers.map { bridge.convert(it.toByteArray(charset), charset).toString(charset) } + .shouldContainInOrder(expectedStrings) + } + } + + @Test + fun `can fast parse doubles`() { + val parser = DoubleParser(ParserOptions(locale = Locale.ROOT, useFastDoubleParser = true)) + + val numbers = listOf( + "12.45", + "-13.35", + "100123.35", + "-204,235.23", + "1.234e3", + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } + + @Test + fun `can fast parse german locale`() { + val parser = DoubleParser(ParserOptions(locale = Locale.GERMANY, useFastDoubleParser = true)) + + val numbers = listOf( + "12,45", + "-13,35", + "100.123,35", + "-204.235,23", + "1,234e3", + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } + + @Test + fun `can fast parse french locale`() { + val parser = DoubleParser(ParserOptions(locale = Locale.FRANCE, useFastDoubleParser = true)) + + val numbers = listOf( + "12,45", + "-13,35", + "100 123,35", + "-204 235,23", + "1,234e3", + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } + + @Test + fun `can fast parse estonian locale`() { + val parser = DoubleParser(ParserOptions(locale = Locale.forLanguageTag("et-EE"), useFastDoubleParser = true)) + + val numbers = listOf( + "12,45", + "−13,35", // note the different minus sign '−' vs '-' + "100 123,35", + "−204 235,23", // note the different minus sign '−' vs '-' + "1,234e3", + "-345,122", // check forgiving behavior with 'ordinary' minus sign + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + -345.122, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } +}