Skip to content

Commit

Permalink
improved DoubleParser and DecimalFormatBridge such that characters sp…
Browse files Browse the repository at this point in the history
…anning multiple bytes can now also be converted to other locales before being parsed. Added tests
  • Loading branch information
Jolanrensen committed Oct 10, 2024
1 parent ac8a5e5 commit e2e8c05
Show file tree
Hide file tree
Showing 3 changed files with 391 additions and 66 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,9 @@ public interface GlobalParserOptions {
* ### Options for parsing [String]`?` columns
*
* @param locale locale to use for parsing dates and numbers, defaults to the System default locale.
* If specified instead of [dateTimeFormatter], it will be used (in combination with the optional [dateTimePattern])
* to create a [DateTimeFormatter].
* If specified instead of [dateTimeFormatter], it will be used in combination with [dateTimePattern]
* to create a [DateTimeFormatter]. Just providing [locale] will not allow you to parse
* locale-specific dates!
* @param dateTimeFormatter a [DateTimeFormatter] to use for parsing dates, if not specified, it will be created
* from [dateTimePattern] and [locale]. If neither [dateTimeFormatter] nor [dateTimePattern] are specified,
* [DateTimeFormatter.ISO_LOCAL_DATE_TIME] will be used.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,19 @@ import java.util.Locale
* [JavaDoubleParser]. This parser relies on [DecimalFormatBridge] to convert the input to a format that can be
* parsed by [JavaDoubleParser] and thus may be unreliable.
*
* Public so it can be used in other modules.
* Public, so it can be used in other modules.
* Open, so it may be modified with your own (better) [DecimalFormatBridge].
*/
public class DoubleParser(private val parserOptions: ParserOptions) {
public open class DoubleParser(private val parserOptions: ParserOptions) {

private val locale = parserOptions.locale ?: Locale.getDefault()
protected val locale: Locale = parserOptions.locale ?: Locale.getDefault()

private val bridge
get() = DecimalFormatBridge(from = locale, to = Locale.ROOT)
protected val supportedFastCharsets: Set<Charset> = setOf(Charsets.UTF_8, Charsets.ISO_8859_1, Charsets.US_ASCII)
protected open val bridge: DecimalFormatBridge
get() = DecimalFormatBridgeImpl(from = locale, to = Locale.ROOT)

// fallback method for parsing doubles
private fun String.parseToDoubleOrNull(): Double? =
protected fun String.parseToDoubleOrNull(): Double? =
when (lowercase()) {
"inf", "+inf", "infinity", "+infinity" -> Double.POSITIVE_INFINITY

Expand All @@ -46,16 +48,17 @@ public class DoubleParser(private val parserOptions: ParserOptions) {
result
}
}
} ?: let {
println("could not parse $this as double, returning null.")
null
}

private val supportedFastCharsets = setOf(Charsets.UTF_8, Charsets.ISO_8859_1, Charsets.US_ASCII)

// parse a byte array of encoded strings into a double
public fun parseOrNull(ba: ByteArray, charset: Charset = Charsets.UTF_8): Double? {
public open fun parseOrNull(ba: ByteArray, charset: Charset = Charsets.UTF_8): Double? {
if (parserOptions.useFastDoubleParser && charset in supportedFastCharsets) {
try {
// first try to parse with JavaDoubleParser by converting the locale to ROOT
val array = bridge.convert(ba)
val array = bridge.convert(ba, charset)

return JavaDoubleParser.parseDouble(array)
} catch (_: Exception) {
Expand All @@ -64,7 +67,7 @@ public class DoubleParser(private val parserOptions: ParserOptions) {
return ba.toString(charset).parseToDoubleOrNull()
}

public fun parseOrNull(cs: CharSequence): Double? {
public open fun parseOrNull(cs: CharSequence): Double? {
if (parserOptions.useFastDoubleParser) {
try {
// first try to parse with JavaDoubleParser by converting the locale to ROOT
Expand All @@ -78,7 +81,7 @@ public class DoubleParser(private val parserOptions: ParserOptions) {
return cs.toString().parseToDoubleOrNull()
}

public fun parseOrNull(ca: CharArray): Double? {
public open fun parseOrNull(ca: CharArray): Double? {
if (parserOptions.useFastDoubleParser) {
try {
// first try to parse with JavaDoubleParser by converting the locale to ROOT
Expand Down Expand Up @@ -108,70 +111,157 @@ public class DoubleParser(private val parserOptions: ParserOptions) {
*
* May be removed if [Issue #82](https://github.com/wrandelshofer/FastDoubleParser/issues/82) is resolved.
*/
internal class DecimalFormatBridge private constructor(from: DecimalFormatSymbols, to: DecimalFormatSymbols) {
public interface DecimalFormatBridge {

public fun convert(string: String): String

public fun convert(bytes: ByteArray, charset: Charset): ByteArray

public fun convert(chars: CharArray): CharArray
}

/**
* Implementation of [DecimalFormatBridge] which replaces individual characters in [DecimalFormatSymbols]
* of the source [Locale] into those of the destination [Locale].
*
* This catches most cases, but definitely not all.
*
* NOTE: Strings are not replaced.
*/
internal class DecimalFormatBridgeImpl private constructor(from: DecimalFormatSymbols, to: DecimalFormatSymbols) :
DecimalFormatBridge {

companion object {
companion object {

private val memoizedBridges: MutableMap<Pair<Locale, Locale>, DecimalFormatBridge> = mutableMapOf()
private val memoizedBridges: MutableMap<Triple<Locale, Locale, Set<Charset>>, DecimalFormatBridge> =
mutableMapOf()

operator fun invoke(
from: Locale,
to: Locale,
supportedCharsets: Set<Charset> = setOf(
Charsets.UTF_8,
Charsets.ISO_8859_1,
Charsets.US_ASCII,
),
): DecimalFormatBridge =
memoizedBridges.getOrPut(Triple(from, to, supportedCharsets)) {
DecimalFormatBridgeImpl(
from = DecimalFormatSymbols.getInstance(from),
to = DecimalFormatSymbols.getInstance(to),
)
}
}

// whether the conversion is a no-op
private val noop = from == to

private val charConversions: Map<Char, Char> = listOf(
from.zeroDigit to to.zeroDigit,
from.groupingSeparator to to.groupingSeparator,
from.decimalSeparator to to.decimalSeparator,
from.perMill to to.perMill,
from.percent to to.percent,
from.digit to to.digit,
from.patternSeparator to to.patternSeparator,
from.minusSign to to.minusSign,
from.monetaryDecimalSeparator to to.monetaryDecimalSeparator,
).filter { it.first != it.second }
.toMap()

private fun getBytesConversions(charset: Charset): List<Pair<ByteArray, ByteArray>> =
charConversions
.map { (from, to) -> "$from".toByteArray(charset) to "$to".toByteArray(charset) }
.sortedByDescending { it.first.size }
.distinctBy { it.first.toString(charset) }

/**
* Lazy map (to be filled with [getBytesConversions]) per charset of [charConversions]
* that contains the char-as-byteArray version of [charConversions].
* Useful if you want to convert a [String] as [ByteArray] to another [Locale].
*/
private val bytesConversions: MutableMap<Charset, List<Pair<ByteArray, ByteArray>>> = mutableMapOf()

override fun convert(string: String): String {
if (noop || charConversions.isEmpty()) return string
return string.map { charConversions[it] ?: it }.joinToString("")
}

operator fun invoke(from: Locale, to: Locale): DecimalFormatBridge =
memoizedBridges.getOrPut(from to to) {
DecimalFormatBridge(
DecimalFormatSymbols.getInstance(from),
DecimalFormatSymbols.getInstance(to),
)
override fun convert(bytes: ByteArray, charset: Charset): ByteArray {
val conversionMap = bytesConversions
.getOrPut(charset) { getBytesConversions(charset) }

// gets all replacements as (index, original byte count, replacement) triples
val replacements = conversionMap.flatMap { (from, to) ->
bytes.findAllSubsequenceIndices(from).map { Triple(it, from.size, to) }
}
}

// whether the conversion is a no-op
private val noop = from == to

private val conversions: Map<Char, Char> = listOf(
from.zeroDigit to to.zeroDigit,
from.groupingSeparator to to.groupingSeparator,
from.decimalSeparator to to.decimalSeparator,
from.perMill to to.perMill,
from.percent to to.percent,
from.digit to to.digit,
from.patternSeparator to to.patternSeparator,
from.minusSign to to.minusSign,
from.monetaryDecimalSeparator to to.monetaryDecimalSeparator,
).filter { it.first != it.second }
.toMap()

private val byteConversions = conversions
.map { (from, to) ->
from.code.toByte() to to.code.toByte()
}.toMap()

fun convert(input: String): String {
if (noop || conversions.isEmpty()) return input
return input.map { conversions[it] ?: it }.joinToString("")
}
// change bytes into a list of byte arrays, where each byte array is a single byte
val adjustableBytes = bytes.map { byteArrayOf(it) }.toMutableList()

fun convert(bytes: ByteArray): ByteArray {
val copy = bytes.copyOf()
convertInPlace(copy)
return copy
}
// perform each replacement in adjustableBytes
// if the new replacement is shorter than the original, remove bytes after the replacement
// if the new replacement is longer than the original, well that's why we have a byte array per original byte
for ((i, originalByteCount, replacement) in replacements) {
adjustableBytes[i] = replacement
for (j in (i + 1)..<(i + originalByteCount)) {
adjustableBytes[j] = byteArrayOf()
}
}

// flatten the list of byte arrays into a single byte array
val result = ByteArray(adjustableBytes.sumOf { it.size })
var i = 0
for (it in adjustableBytes) {
if (it.isEmpty()) continue
System.arraycopy(it, 0, result, i, it.size)
i += it.size
}

return result
}

fun convertInPlace(bytes: ByteArray) {
if (noop || conversions.isEmpty()) return
for (i in bytes.indices) {
bytes[i] = byteConversions[bytes[i]] ?: bytes[i]
override fun convert(chars: CharArray): CharArray {
val copy = chars.copyOf()
convertInPlace(copy)
return copy
}

fun convertInPlace(chars: CharArray) {
if (noop || charConversions.isEmpty()) return
for (i in chars.indices) {
chars[i] = charConversions[chars[i]] ?: chars[i]
}
}
}

fun convert(chars: CharArray): CharArray {
val copy = chars.copyOf()
convertInPlace(copy)
return copy
// Helper function to find all subsequences in a byte array
internal fun ByteArray.findAllSubsequenceIndices(subBytes: ByteArray): List<Int> {
val result = mutableListOf<Int>()
if (subBytes.isEmpty()) {
return result
}

fun convertInPlace(chars: CharArray) {
if (noop || conversions.isEmpty()) return
for (i in chars.indices) {
chars[i] = conversions[chars[i]] ?: chars[i]
val lenMain = this.size
val lenSub = subBytes.size

var i = 0
while (i <= lenMain - lenSub) {
var match = true
for (j in subBytes.indices) {
if (this[i + j] != subBytes[j]) {
match = false
break
}
}

if (match) {
result.add(i)
i += lenSub // Move i forward by the length of the subsequence to avoid overlapping matches
} else {
i++
}
}

return result
}
Loading

0 comments on commit e2e8c05

Please sign in to comment.