diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index 7256abe3e..b7b2c7437 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -44,8 +44,9 @@ public interface GlobalParserOptions { * ### Options for parsing [String]`?` columns * * @param locale locale to use for parsing dates and numbers, defaults to the System default locale. - * If specified instead of [dateTimeFormatter], it will be used (in combination with the optional [dateTimePattern]) - * to create a [DateTimeFormatter]. + * If specified instead of [dateTimeFormatter], it will be used in combination with [dateTimePattern] + * to create a [DateTimeFormatter]. Just providing [locale] will not allow you to parse + * locale-specific dates! * @param dateTimeFormatter a [DateTimeFormatter] to use for parsing dates, if not specified, it will be created * from [dateTimePattern] and [locale]. If neither [dateTimeFormatter] nor [dateTimePattern] are specified, * [DateTimeFormatter.ISO_LOCAL_DATE_TIME] will be used. diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/DoubleParser.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/DoubleParser.kt index 7e8aedc54..9c4fba3d1 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/DoubleParser.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/DoubleParser.kt @@ -1,6 +1,7 @@ package org.jetbrains.kotlinx.dataframe.impl.io import ch.randelshofer.fastdoubleparser.JavaDoubleParser +import io.github.oshai.kotlinlogging.KotlinLogging import org.jetbrains.kotlinx.dataframe.api.ParserOptions import java.nio.charset.Charset import java.text.DecimalFormatSymbols @@ -8,6 +9,8 @@ import java.text.NumberFormat import java.text.ParsePosition import java.util.Locale +private val logger = KotlinLogging.logger {} + /** * Parses a [String]/[CharSequence], [CharArray], or [ByteArray] into a [Double]. * @@ -15,17 +18,27 @@ import java.util.Locale * [JavaDoubleParser]. This parser relies on [DecimalFormatBridge] to convert the input to a format that can be * parsed by [JavaDoubleParser] and thus may be unreliable. * - * Public so it can be used in other modules. + * Public, so it can be used in other modules. + * Open, so it may be modified with your own (better) [DecimalFormatBridge]. */ -public class DoubleParser(private val parserOptions: ParserOptions) { +public open class DoubleParser(private val parserOptions: ParserOptions) { + + protected val locale: Locale = parserOptions.locale ?: Locale.getDefault() - private val locale = parserOptions.locale ?: Locale.getDefault() + protected val supportedFastCharsets: Set = setOf(Charsets.UTF_8, Charsets.ISO_8859_1, Charsets.US_ASCII) - private val bridge - get() = DecimalFormatBridge(from = locale, to = Locale.ROOT) + private val targetDecimalFormatSymbols = DecimalFormatSymbols.getInstance(Locale.ROOT) + .also { + it.groupingSeparator = Char.MIN_VALUE // remove groupingSeparator for FastJavaDoubleParser + } + protected open val bridge: DecimalFormatBridge + get() = DecimalFormatBridgeImpl( + from = DecimalFormatSymbols.getInstance(locale), + to = targetDecimalFormatSymbols, + ) // fallback method for parsing doubles - private fun String.parseToDoubleOrNull(): Double? = + protected fun String.parseToDoubleOrNull(): Double? = when (lowercase()) { "inf", "+inf", "infinity", "+infinity" -> Double.POSITIVE_INFINITY @@ -46,46 +59,62 @@ public class DoubleParser(private val parserOptions: ParserOptions) { result } } + }.also { + if (it == null) { + logger.debug { "Could not parse '$this' as Double with NumberFormat with locale '$locale'." } + } } - private val supportedFastCharsets = setOf(Charsets.UTF_8, Charsets.ISO_8859_1, Charsets.US_ASCII) - // parse a byte array of encoded strings into a double - public fun parseOrNull(ba: ByteArray, charset: Charset = Charsets.UTF_8): Double? { + public open fun parseOrNull(ba: ByteArray, charset: Charset = Charsets.UTF_8): Double? { if (parserOptions.useFastDoubleParser && charset in supportedFastCharsets) { + // first try to parse with JavaDoubleParser by converting the locale to ROOT + val array = bridge.convert(ba, charset) try { - // first try to parse with JavaDoubleParser by converting the locale to ROOT - val array = bridge.convert(ba) - return JavaDoubleParser.parseDouble(array) - } catch (_: Exception) { + } catch (e: Exception) { + logger.debug(e) { + "Failed to parse '${ + ba.toString(charset) + }' as '${ + array.toString(charset) + }' from a ByteArray to Double with FastDoubleParser with locale '$locale'." + } } } return ba.toString(charset).parseToDoubleOrNull() } - public fun parseOrNull(cs: CharSequence): Double? { + public open fun parseOrNull(cs: CharSequence): Double? { if (parserOptions.useFastDoubleParser) { + // first try to parse with JavaDoubleParser by converting the locale to ROOT + val converted = bridge.convert(cs.toString()) try { - // first try to parse with JavaDoubleParser by converting the locale to ROOT - val converted = bridge.convert(cs.toString()) - return JavaDoubleParser.parseDouble(converted) - } catch (_: Exception) { + } catch (e: Exception) { + logger.debug(e) { + "Failed to parse '$cs' as '$converted' from a CharSequence to Double with FastDoubleParser with locale '$locale'." + } } } return cs.toString().parseToDoubleOrNull() } - public fun parseOrNull(ca: CharArray): Double? { + public open fun parseOrNull(ca: CharArray): Double? { if (parserOptions.useFastDoubleParser) { + // first try to parse with JavaDoubleParser by converting the locale to ROOT + val converted = bridge.convert(ca) try { - // first try to parse with JavaDoubleParser by converting the locale to ROOT - val converted = bridge.convert(ca) - return JavaDoubleParser.parseDouble(converted) - } catch (_: Exception) { + } catch (e: Exception) { + logger.debug(e) { + "Failed to parse '${ + ca.joinToString("") + }' as '${ + converted.joinToString("") + }' from a CharArray to Double with FastDoubleParser with locale '$locale'." + } } } @@ -108,70 +137,157 @@ public class DoubleParser(private val parserOptions: ParserOptions) { * * May be removed if [Issue #82](https://github.com/wrandelshofer/FastDoubleParser/issues/82) is resolved. */ -internal class DecimalFormatBridge private constructor(from: DecimalFormatSymbols, to: DecimalFormatSymbols) { +public interface DecimalFormatBridge { + + public fun convert(string: String): String + + public fun convert(bytes: ByteArray, charset: Charset): ByteArray - companion object { + public fun convert(chars: CharArray): CharArray +} + +/** + * Implementation of [DecimalFormatBridge] which replaces individual characters in [DecimalFormatSymbols] + * of the source into those of the destination [DecimalFormatSymbols]. + * + * This catches most cases, but definitely not all. + * + * NOTE: Strings are not replaced. + */ +internal class DecimalFormatBridgeImpl private constructor(from: DecimalFormatSymbols, to: DecimalFormatSymbols) : + DecimalFormatBridge { + + companion object { + + private val memoizedBridges: + MutableMap, DecimalFormatBridge> = + mutableMapOf() + + operator fun invoke(from: Locale, to: Locale): DecimalFormatBridge = + invoke(DecimalFormatSymbols.getInstance(from), DecimalFormatSymbols.getInstance(to)) + + operator fun invoke(from: DecimalFormatSymbols, to: DecimalFormatSymbols): DecimalFormatBridge = + memoizedBridges.getOrPut(Pair(from, to)) { + DecimalFormatBridgeImpl( + from = from, + to = to, + ) + } + } - private val memoizedBridges: MutableMap, DecimalFormatBridge> = mutableMapOf() + // whether the conversion is a no-op + private val noop = from == to - operator fun invoke(from: Locale, to: Locale): DecimalFormatBridge = - memoizedBridges.getOrPut(from to to) { - DecimalFormatBridge( - DecimalFormatSymbols.getInstance(from), - DecimalFormatSymbols.getInstance(to), + private val charConversions: Map = listOf( + from.zeroDigit to to.zeroDigit, + from.groupingSeparator to to.groupingSeparator, + from.decimalSeparator to to.decimalSeparator, + from.perMill to to.perMill, + from.percent to to.percent, + from.digit to to.digit, + from.patternSeparator to to.patternSeparator, + from.minusSign to to.minusSign, + from.monetaryDecimalSeparator to to.monetaryDecimalSeparator, + ).filter { it.first != it.second } + .toMap() + + private fun getBytesConversions(charset: Charset): List> = + charConversions.map { (from, to) -> + Pair( + "$from".toByteArray(charset), + if (to == Char.MIN_VALUE) byteArrayOf() else "$to".toByteArray(charset), ) } - } + .sortedByDescending { it.first.size } + .distinctBy { it.first.toString(charset) } - // whether the conversion is a no-op - private val noop = from == to - - private val conversions: Map = listOf( - from.zeroDigit to to.zeroDigit, - from.groupingSeparator to to.groupingSeparator, - from.decimalSeparator to to.decimalSeparator, - from.perMill to to.perMill, - from.percent to to.percent, - from.digit to to.digit, - from.patternSeparator to to.patternSeparator, - from.minusSign to to.minusSign, - from.monetaryDecimalSeparator to to.monetaryDecimalSeparator, - ).filter { it.first != it.second } - .toMap() - - private val byteConversions = conversions - .map { (from, to) -> - from.code.toByte() to to.code.toByte() - }.toMap() - - fun convert(input: String): String { - if (noop || conversions.isEmpty()) return input - return input.map { conversions[it] ?: it }.joinToString("") - } + /** + * Lazy map (to be filled with [getBytesConversions]) per charset of [charConversions] + * that contains the char-as-byteArray version of [charConversions]. + * Useful if you want to convert a [String] as [ByteArray] to another [Locale]. + */ + private val bytesConversions: MutableMap>> = mutableMapOf() - fun convert(bytes: ByteArray): ByteArray { - val copy = bytes.copyOf() - convertInPlace(copy) - return copy - } + override fun convert(string: String): String { + if (noop || charConversions.isEmpty()) return string + return string.map { charConversions[it] ?: it }.joinToString("") + } + + override fun convert(bytes: ByteArray, charset: Charset): ByteArray { + val conversionMap = bytesConversions + .getOrPut(charset) { getBytesConversions(charset) } + + // gets all replacements as (index, original byte count, replacement) triples + val replacements = conversionMap.flatMap { (from, to) -> + bytes.findAllSubsequenceIndices(from).map { Triple(it, from.size, to) } + } + + // change bytes into a list of byte arrays, where each byte array is a single byte + val adjustableBytes = bytes.map { byteArrayOf(it) }.toMutableList() + + // perform each replacement in adjustableBytes + // if the new replacement is shorter than the original, remove bytes after the replacement + // if the new replacement is longer than the original, well that's why we have a byte array per original byte + for ((i, originalByteCount, replacement) in replacements) { + adjustableBytes[i] = replacement + for (j in (i + 1)..<(i + originalByteCount)) { + adjustableBytes[j] = byteArrayOf() + } + } - fun convertInPlace(bytes: ByteArray) { - if (noop || conversions.isEmpty()) return - for (i in bytes.indices) { - bytes[i] = byteConversions[bytes[i]] ?: bytes[i] + // flatten the list of byte arrays into a single byte array + val result = ByteArray(adjustableBytes.sumOf { it.size }) + var i = 0 + for (it in adjustableBytes) { + if (it.isEmpty()) continue + System.arraycopy(it, 0, result, i, it.size) + i += it.size + } + + return result + } + + override fun convert(chars: CharArray): CharArray { + val copy = chars.copyOf() + convertInPlace(copy) + return copy + } + + fun convertInPlace(chars: CharArray) { + if (noop || charConversions.isEmpty()) return + for (i in chars.indices) { + chars[i] = charConversions[chars[i]] ?: chars[i] + } } } - fun convert(chars: CharArray): CharArray { - val copy = chars.copyOf() - convertInPlace(copy) - return copy +// Helper function to find all subsequences in a byte array +internal fun ByteArray.findAllSubsequenceIndices(subBytes: ByteArray): List { + val result = mutableListOf() + if (subBytes.isEmpty()) { + return result } - fun convertInPlace(chars: CharArray) { - if (noop || conversions.isEmpty()) return - for (i in chars.indices) { - chars[i] = conversions[chars[i]] ?: chars[i] + val lenMain = this.size + val lenSub = subBytes.size + + var i = 0 + while (i <= lenMain - lenSub) { + var match = true + for (j in subBytes.indices) { + if (this[i + j] != subBytes[j]) { + match = false + break + } + } + + if (match) { + result.add(i) + i += lenSub // Move i forward by the length of the subsequence to avoid overlapping matches + } else { + i++ } } + + return result } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/DoubleParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/DoubleParserTests.kt new file mode 100644 index 000000000..1731be182 --- /dev/null +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/DoubleParserTests.kt @@ -0,0 +1,234 @@ +package org.jetbrains.kotlinx.dataframe.io + +import io.kotest.matchers.collections.shouldContainInOrder +import org.jetbrains.kotlinx.dataframe.api.ParserOptions +import org.jetbrains.kotlinx.dataframe.impl.io.DecimalFormatBridgeImpl +import org.jetbrains.kotlinx.dataframe.impl.io.DoubleParser +import org.junit.Test +import java.util.Locale + +class DoubleParserTests { + + @Test + fun `can bridge from German locale`() { + val numbers = listOf( + "12,45", + "-13,35", + "100.123,35", + "-204.235,23", + "1,234e3", + ) + + val expectedStrings = listOf( + "12.45", + "-13.35", + "100,123.35", + "-204,235.23", + "1.234e3", + ) + // test bridge + val bridge = DecimalFormatBridgeImpl(Locale.GERMANY, Locale.ROOT) + + // CharSequence + numbers.map { bridge.convert(it) }.shouldContainInOrder(expectedStrings) + + // CharArray + numbers.map { bridge.convert(it.toCharArray()).joinToString("") }.shouldContainInOrder(expectedStrings) + + // ByteArray + for (charset in listOf(Charsets.UTF_8, Charsets.ISO_8859_1, Charsets.US_ASCII)) { + numbers.map { bridge.convert(it.toByteArray(charset), charset).toString(charset) } + .shouldContainInOrder(expectedStrings) + } + } + + @Test + fun `can bridge from French locale`() { + val numbers = listOf( + "12,45", + "-13,35", + "100 123,35", + "-204 235,23", + "1,234e3", + ) + + val expectedStrings = listOf( + "12.45", + "-13.35", + "100,123.35", + "-204,235.23", + "1.234e3", + ) + // test bridge + val bridge = DecimalFormatBridgeImpl(Locale.FRANCE, Locale.ROOT) + + // CharSequence + numbers.map { bridge.convert(it) }.shouldContainInOrder(expectedStrings) + + // CharArray + numbers.map { bridge.convert(it.toCharArray()).joinToString("") }.shouldContainInOrder(expectedStrings) + + // ByteArray + for (charset in listOf(Charsets.UTF_8, Charsets.ISO_8859_1, Charsets.US_ASCII)) { + numbers.map { bridge.convert(it.toByteArray(charset), charset).toString(charset) } + .shouldContainInOrder(expectedStrings) + } + } + + @Test + fun `can bridge from Estonian locale`() { + val numbers = listOf( + "12,45", + "−13,35", // note the different minus sign '−' vs '-' + "100 123,35", + "−204 235,23", // note the different minus sign '−' vs '-' + "1,234e3", + "-345,122", // check forgiving behavior with 'ordinary' minus sign + ) + + val expectedStrings = listOf( + "12.45", + "-13.35", + "100,123.35", + "-204,235.23", + "1.234e3", + "-345.122", + ) + // test bridge + val bridge = DecimalFormatBridgeImpl(Locale.forLanguageTag("et-EE"), Locale.ROOT) + + // CharSequence + numbers.map { bridge.convert(it) }.shouldContainInOrder(expectedStrings) + + // CharArray + numbers.map { bridge.convert(it.toCharArray()).joinToString("") }.shouldContainInOrder(expectedStrings) + + // ByteArray (must skip ASCII as NBSP and estonian minus sign are not ASCII) + for (charset in listOf(Charsets.UTF_8, Charsets.ISO_8859_1)) { + numbers.map { bridge.convert(it.toByteArray(charset), charset).toString(charset) } + .shouldContainInOrder(expectedStrings) + } + } + + @Test + fun `can fast parse doubles`() { + val parser = DoubleParser(ParserOptions(locale = Locale.ROOT, useFastDoubleParser = true)) + + val numbers = listOf( + "12.45", + "-13.35", + "100123.35", + "-204,235.23", + "1.234e3", + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } + + @Test + fun `can fast parse german locale`() { + val parser = DoubleParser(ParserOptions(locale = Locale.GERMANY, useFastDoubleParser = true)) + + val numbers = listOf( + "12,45", + "-13,35", + "100.123,35", + "-204.235,23", + "1,234e3", + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } + + @Test + fun `can fast parse french locale`() { + val parser = DoubleParser(ParserOptions(locale = Locale.FRANCE, useFastDoubleParser = true)) + + val numbers = listOf( + "12,45", + "-13,35", + "100 123,35", + "-204 235,23", + "1,234e3", + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } + + @Test + fun `can fast parse estonian locale`() { + val parser = DoubleParser(ParserOptions(locale = Locale.forLanguageTag("et-EE"), useFastDoubleParser = true)) + + val numbers = listOf( + "12,45", + "−13,35", // note the different minus sign '−' vs '-' + "100 123,35", + "−204 235,23", // note the different minus sign '−' vs '-' + "1,234e3", + "-345,122", // check forgiving behavior with 'ordinary' minus sign + ) + + val expectedDoubles = listOf( + 12.45, + -13.35, + 100_123.35, + -204_235.23, + 1.234e3, + -345.122, + ) + + // CharSequence + numbers.map { parser.parseOrNull(it) }.shouldContainInOrder(expectedDoubles) + + // CharArray + numbers.map { parser.parseOrNull(it.toCharArray()) }.shouldContainInOrder(expectedDoubles) + + // ByteArray + numbers.map { parser.parseOrNull(it.toByteArray()) }.shouldContainInOrder(expectedDoubles) + } +}