diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index 149a33d65..b216c7d1c 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -7,6 +7,7 @@ import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.columns.ColumnReference import org.jetbrains.kotlinx.dataframe.columns.toColumnSet import org.jetbrains.kotlinx.dataframe.impl.api.Parsers +import org.jetbrains.kotlinx.dataframe.impl.api.StringParser import org.jetbrains.kotlinx.dataframe.impl.api.parseImpl import org.jetbrains.kotlinx.dataframe.impl.api.tryParseImpl import org.jetbrains.kotlinx.dataframe.typeClass @@ -55,6 +56,17 @@ public data class ParserOptions( } } +/** Tries to parse a column of strings into a column of a different type. + * Each parser in [Parsers][org.jetbrains.kotlinx.dataframe.impl.api.Parsers] is run in order until a valid parser is found, + * a.k.a. that parser was able to parse all values in the column successfully. If a parser + * fails to parse any value, the next parser is tried. If all the others fail, the final parser + * simply returns the original string, leaving the column unchanged. + * + * Parsers that are [covered by][org.jetbrains.kotlinx.dataframe.impl.api.StringParser.coveredBy] other parsers are skipped. + * + * @param options options for parsing, like providing a locale or a custom date-time formatter + * @throws IllegalStateException if no valid parser is found (unlikely, unless the `String` parser is disabled) + * @return a new column with parsed values */ public fun DataColumn.tryParse(options: ParserOptions? = null): DataColumn<*> = tryParseImpl(options) public fun DataFrame.parse(options: ParserOptions? 
= null): DataFrame = @@ -62,6 +74,21 @@ public fun DataFrame.parse(options: ParserOptions? = null): DataFrame colsAtAnyDepth { !it.isColumnGroup() } } +/** + * Tries to parse a column of strings into a column of a different type. + * Each parser in [Parsers] is run in order until a valid parser is found, + * a.k.a. that parser was able to parse all values in the column successfully. If a parser + * fails to parse any value, the next parser is tried. + * + * If all fail [IllegalStateException] is thrown. If you don't want this exception to be thrown, + * use [tryParse] instead. + * + * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped. + * + * @param options options for parsing, like providing a locale or a custom date-time formatter + * @throws IllegalStateException if no valid parser is found + * @return a new column with parsed values + */ public fun DataColumn.parse(options: ParserOptions? = null): DataColumn<*> = tryParse(options).also { if (it.typeClass == String::class) error("Can't guess column type") } diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/CanParseUtils.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/CanParseUtils.kt new file mode 100644 index 000000000..26de7a117 --- /dev/null +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/CanParseUtils.kt @@ -0,0 +1,151 @@ +package org.jetbrains.kotlinx.dataframe.impl + +import kotlin.time.Duration +import kotlin.time.DurationUnit + +/** + * Checks if the string can be parsed as a duration without throwing an exception. + * + * The logic is taken from [Duration.parse] (Kotlin version 2.0.20), + * so it should return the same result. 
+ *
+ * Besides the overall structure (sign, unit names, unit order), the numeric part of every
+ * component is validated too, so inputs such as `"1.2.3s"`, `".5s"` or `"P1.5D"` are rejected
+ * here instead of passing this check and only failing later inside [Duration.parse].
+ *
+ * NOTE(review): failures that can only arise while the parsed components are combined
+ * are presumably still not detected; callers that need a hard guarantee should keep
+ * guarding the actual parse call.
+ *
+ * @param value the string to check
+ * @return `true` if [value] has a shape and numeric components that [Duration.parse] accepts,
+ *   `false` otherwise
+ */
+internal fun Duration.Companion.canParse(value: String): Boolean {
+    var length = value.length
+    if (length == 0) return false
+    var index = 0
+    val infinityString = "Infinity"
+    when (value[index]) {
+        '+', '-' -> index++
+    }
+    val hasSign = index > 0
+    when {
+        // nothing but a sign
+        length <= index -> return false
+
+        // ISO-8601 format, e.g. "PT1H30M" or "P1DT2.5S"
+        value[index] == 'P' -> {
+            if (++index == length) return false
+            val nonDigitSymbols = "+-."
+            var isTimeComponent = false
+            var prevUnit: DurationUnit? = null
+            while (index < length) {
+                if (value[index] == 'T') {
+                    // 'T' may appear only once and must be followed by a component
+                    if (isTimeComponent || ++index == length) return false
+                    isTimeComponent = true
+                    continue
+                }
+                val component = value.substringWhile(index) { it in '0'..'9' || it in nonDigitSymbols }
+                if (component.isEmpty()) return false
+                index += component.length
+                val unitChar = value.getOrElse(index) { return false }
+                index++
+                val unit = durationUnitByIsoCharOrNull(unitChar, isTimeComponent) ?: return false
+                // units must be strictly descending (days > hours > minutes > seconds)
+                if (prevUnit != null && prevUnit <= unit) return false
+                prevUnit = unit
+
+                // Only seconds may carry a fraction; every other component has to be a whole number.
+                val dotIndex = component.indexOf('.')
+                val numberIsValid =
+                    if (unit == DurationUnit.SECONDS && dotIndex > 0) {
+                        canParseOverLongIsoComponent(component.substring(0, dotIndex)) &&
+                            component.substring(dotIndex).toDoubleOrNull() != null
+                    } else {
+                        canParseOverLongIsoComponent(component)
+                    }
+                if (!numberIsValid) return false
+            }
+        }
+
+        // "Infinity" (any case); using the longer of the two lengths makes regionMatches fail
+        // on both shorter and longer inputs, so only an exact match passes
+        value.regionMatches(
+            thisOffset = index,
+            other = infinityString,
+            otherOffset = 0,
+            length = maxOf(length - index, infinityString.length),
+            ignoreCase = true,
+        ) -> return true
+
+        else -> {
+            // parse default string format
+            var prevUnit: DurationUnit? = null
+            var afterFirst = false
+            var allowSpaces = !hasSign
+            if (hasSign && value[index] == '(' && value.last() == ')') {
+                // signed, parenthesized form, e.g. "-(1h 30m)": look only at what is inside
+                allowSpaces = true
+                if (++index == --length) return false
+            }
+            while (index < length) {
+                if (afterFirst && allowSpaces) {
+                    index = value.skipWhile(index) { it == ' ' }
+                }
+                afterFirst = true
+                val component = value.substringWhile(index) { it in '0'..'9' || it == '.' }
+                if (component.isEmpty()) return false
+                index += component.length
+                val unitName = value.substringWhile(index) { it in 'a'..'z' }
+                index += unitName.length
+                val unit = durationUnitByShortNameOrNull(unitName) ?: return false
+                // units must be strictly descending (d > h > m > s > ms > us > ns)
+                if (prevUnit != null && prevUnit <= unit) return false
+                prevUnit = unit
+                val dotIndex = component.indexOf('.')
+                if (dotIndex > 0) {
+                    // whole and fractional parts must both be valid numbers ("1.2.3" and "5." are not)
+                    if (component.substring(0, dotIndex).toLongOrNull() == null) return false
+                    if (component.substring(dotIndex).toDoubleOrNull() == null) return false
+                    // a fractional component must be the last one
+                    if (index < length) return false
+                } else if (component.toLongOrNull() == null) {
+                    // also covers a leading dot (".5") and values that do not fit into a Long
+                    return false
+                }
+            }
+        }
+    }
+    return true
+}
+
+/**
+ * Checks if the string can be parsed as a java duration without throwing an exception.
+ *
+ * A match of [isoDurationRegex] alone is not sufficient: [java.time.Duration.parse] additionally
+ * rejects a time designator `T` that is not followed by any time section (`"PT"`, `"P1DT"`)
+ * and input that contains no section at all (`"P"`), so both cases are rejected here as well.
+ *
+ * NOTE(review): numeric overflow of the individual sections is not checked here.
+ *
+ * @param value the string to check
+ * @return `true` if [value] has a shape that [java.time.Duration.parse] accepts
+ */
+internal fun javaDurationCanParse(value: String): Boolean {
+    val groups = isoDurationRegex.matchEntire(value)?.groups ?: return false
+    // group 3 is the whole time part; a lone upper-case 'T' means "designator without sections"
+    if (groups[3]?.value == "T") return false
+    // at least one of days (2), hours (4), minutes (5) or seconds (6) must be present
+    return groups[2] != null || groups[4] != null || groups[5] != null || groups[6] != null
+}
+
+/**
+ * regex from [java.time.Duration.Lazy.PATTERN], it represents the ISO-8601 duration format.
+ */
+private val isoDurationRegex = Regex(
+    """([-+]?)P(?:([-+]?[0-9]+)D)?(T(?:([-+]?[0-9]+)H)?(?:([-+]?[0-9]+)M)?(?:([-+]?[0-9]+)(?:[.,]([0-9]{0,9}))?S)?)?""",
+    RegexOption.IGNORE_CASE,
+)
+
+/**
+ * Copy of [kotlin.time.substringWhile] (Kotlin version 2.0.20).
+ */
+private inline fun String.substringWhile(startIndex: Int, predicate: (Char) -> Boolean): String =
+    substring(startIndex, skipWhile(startIndex, predicate))
+
+/**
+ * Copy of [kotlin.time.skipWhile] (Kotlin version 2.0.20).
+ */
+private inline fun String.skipWhile(startIndex: Int, predicate: (Char) -> Boolean): Int {
+    var i = startIndex
+    while (i < length && predicate(this[i])) i++
+    return i
+}
+
+/**
+ * Copy of [kotlin.time.durationUnitByIsoChar] (Kotlin version 2.0.20).
+ */
+private fun durationUnitByIsoCharOrNull(isoChar: Char, isTimeComponent: Boolean): DurationUnit? =
+    when {
+        !isTimeComponent -> {
+            when (isoChar) {
+                'D' -> DurationUnit.DAYS
+                else -> null
+            }
+        }
+
+        else -> {
+            when (isoChar) {
+                'H' -> DurationUnit.HOURS
+                'M' -> DurationUnit.MINUTES
+                'S' -> DurationUnit.SECONDS
+                else -> null
+            }
+        }
+    }
+
+/**
+ * Non-throwing counterpart of `kotlin.time.parseOverLongIsoComponent` (Kotlin version 2.0.20):
+ * tells whether the whole-number part of an ISO component would be accepted.
+ *
+ * An optionally signed run of more than 16 digits is always accepted (the original saturates it
+ * instead of failing); anything else has to be a valid [Long], where a single leading `+`
+ * is dropped before the check, like the original does.
+ */
+private fun canParseOverLongIsoComponent(component: String): Boolean {
+    val firstDigitIndex = if (component.isNotEmpty() && component[0] in "+-") 1 else 0
+    val isOverLongNumber = component.length - firstDigitIndex > 16 &&
+        (firstDigitIndex..component.lastIndex).all { component[it] in '0'..'9' }
+    if (isOverLongNumber) return true
+    val withoutLeadingPlus = if (component.startsWith("+")) component.drop(1) else component
+    return withoutLeadingPlus.toLongOrNull() != null
+}
+
+/**
+ * Copy of [kotlin.time.durationUnitByShortName] (Kotlin version 2.0.20).
+ */ +private fun durationUnitByShortNameOrNull(shortName: String): DurationUnit? = + when (shortName) { + "ns" -> DurationUnit.NANOSECONDS + "us" -> DurationUnit.MICROSECONDS + "ms" -> DurationUnit.MILLISECONDS + "s" -> DurationUnit.SECONDS + "m" -> DurationUnit.MINUTES + "h" -> DurationUnit.HOURS + "d" -> DurationUnit.DAYS + else -> null + } diff --git a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index 6759d7ba4..c8577a2f9 100644 --- a/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/generated-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -1,9 +1,5 @@ package org.jetbrains.kotlinx.dataframe.impl.api -import kotlinx.coroutines.async -import kotlinx.coroutines.awaitAll -import kotlinx.coroutines.coroutineScope -import kotlinx.coroutines.runBlocking import kotlinx.datetime.Instant import kotlinx.datetime.LocalDate import kotlinx.datetime.LocalDateTime @@ -325,7 +321,7 @@ internal object Parsers : GlobalParserOptions { parser } - private val parsersOrder = listOf( + internal val parsersOrder = listOf( // Int stringParser { it.toIntOrNull() }, // Long @@ -415,7 +411,7 @@ internal object Parsers : GlobalParserOptions { stringParser { it }, ) - internal val parsersMap = parsersOrder.associateBy { it.type } + private val parsersMap = parsersOrder.associateBy { it.type } val size: Int = parsersOrder.size @@ -478,16 +474,16 @@ internal object Parsers : GlobalParserOptions { internal fun DataColumn.tryParseImpl(options: ParserOptions?): DataColumn<*> { val columnSize = size val parsedValues = ArrayList(columnSize) - var hasNulls: Boolean = false - var hasNotNulls: Boolean = false - var nullStringParsed: Boolean = false + var hasNulls = false + var hasNotNulls = false + var nullStringParsed = false val nulls = options?.nullStrings ?: Parsers.nulls - val 
parsersToCheck = Parsers.parsersMap - val parserTypesToCheck = parsersToCheck.keys + val parsersToCheck = Parsers.parsersOrder + val parserTypesToCheck = parsersToCheck.map { it.type }.toSet() var correctParser: StringParser<*>? = null - for ((_, parser) in parsersToCheck) { + for (parser in parsersToCheck) { if (parser.coveredBy.any { it in parserTypesToCheck }) continue val parserWithOptions = parser.applyOptions(options) @@ -496,13 +492,13 @@ internal fun DataColumn.tryParseImpl(options: ParserOptions?): DataColu hasNotNulls = false nullStringParsed = false for (str in this) { - when { - str == null -> { + when (str) { + null -> { parsedValues += null hasNulls = true } - str in nulls -> { + in nulls -> { parsedValues += null hasNulls = true nullStringParsed = true @@ -510,10 +506,7 @@ internal fun DataColumn.tryParseImpl(options: ParserOptions?): DataColu else -> { val trimmed = str.trim() - val res = parserWithOptions(trimmed) - if (res == null) { - continue - } + val res = parserWithOptions(trimmed) ?: continue parsedValues += res hasNotNulls = true } @@ -545,44 +538,32 @@ internal fun DataColumn.parse(parser: StringParser, options: Par return DataColumn.createValueColumn(name(), parsedValues, parser.type.withNullability(hasNulls)) as DataColumn } -internal fun DataFrame.parseImpl(options: ParserOptions?, columns: ColumnsSelector): DataFrame = - runBlocking { parseParallel(options, columns) } - -private suspend fun DataFrame.parseParallel( - options: ParserOptions?, - columns: ColumnsSelector, -): DataFrame = - coroutineScope { - val convertedCols = getColumnsWithPaths(columns).map { col -> - async { - when { - // when a frame column is requested to be parsed, - // parse each value/frame column at any depth inside each DataFrame in the frame column - col.isFrameColumn() -> - col.values.map { - async { - it.parseParallel(options) { - colsAtAnyDepth { !it.isColumnGroup() } - } - } - }.awaitAll() - .toColumn(col.name) - - // when a column group is requested to be 
parsed, - // parse each column in the group - col.isColumnGroup() -> - col.parseParallel(options) { all() } - .asColumnGroup(col.name()) - .asDataColumn() - - // Base case, parse the column if it's a `String?` column - col.isSubtypeOf() -> - col.cast().tryParse(options) - - else -> col - }.let { ColumnToInsert(col.path, it) } - } - }.awaitAll() +internal fun DataFrame.parseImpl(options: ParserOptions?, columns: ColumnsSelector): DataFrame { + val convertedCols = getColumnsWithPaths(columns).map { col -> + when { + // when a frame column is requested to be parsed, + // parse each value/frame column at any depth inside each DataFrame in the frame column + col.isFrameColumn() -> + col.values.map { + it.parseImpl(options) { + colsAtAnyDepth { !it.isColumnGroup() } + } + }.toColumn(col.name) + + // when a column group is requested to be parsed, + // parse each column in the group + col.isColumnGroup() -> + col.parseImpl(options) { all() } + .asColumnGroup(col.name()) + .asDataColumn() - emptyDataFrame().insertImpl(convertedCols) + // Base case, parse the column if it's a `String?` column + col.isSubtypeOf() -> + col.cast().tryParse(options) + + else -> col + }.let { ColumnToInsert(col.path, it) } } + + return emptyDataFrame().insertImpl(convertedCols) +} diff --git a/docs/StardustDocs/snippets/org.jetbrains.kotlinx.dataframe.samples.api.Modify.parseSome.html b/docs/StardustDocs/snippets/org.jetbrains.kotlinx.dataframe.samples.api.Modify.parseSome.html index af89584d0..5a9252ea0 100644 --- a/docs/StardustDocs/snippets/org.jetbrains.kotlinx.dataframe.samples.api.Modify.parseSome.html +++ b/docs/StardustDocs/snippets/org.jetbrains.kotlinx.dataframe.samples.api.Modify.parseSome.html @@ -183,7 +183,7 @@

- Output DataFrame: rowsCount = 7, columnsCount = 5 + Output DataFrame: rowsCount = 7, columnsCount = 2

@@ -478,13 +478,8 @@ call_DataFrame(function() { DataFrame.renderTable(0) }); /**/