Skip to content

Commit

Permalink
Automated commit of generated code
Browse files Browse the repository at this point in the history
  • Loading branch information
github-actions[bot] committed Oct 15, 2024
1 parent a8cee48 commit 117e958
Show file tree
Hide file tree
Showing 4 changed files with 219 additions and 65 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
import org.jetbrains.kotlinx.dataframe.impl.api.StringParser
import org.jetbrains.kotlinx.dataframe.impl.api.parseImpl
import org.jetbrains.kotlinx.dataframe.impl.api.tryParseImpl
import org.jetbrains.kotlinx.dataframe.typeClass
Expand Down Expand Up @@ -55,13 +56,39 @@ public data class ParserOptions(
}
}

/**
 * Attempts to convert this column of strings into a column of another type.
 *
 * The parsers in [Parsers][org.jetbrains.kotlinx.dataframe.impl.api.Parsers] are tried one after another.
 * A parser is accepted only when it parses every value of the column successfully; as soon as it fails
 * on a value, the next parser gets its turn. The last parser just returns the original string, so when
 * nothing else fits, the column comes back unchanged.
 *
 * Parsers that are [covered by][org.jetbrains.kotlinx.dataframe.impl.api.StringParser.coveredBy]
 * other parsers are skipped.
 *
 * @param options options for parsing, like providing a locale or a custom date-time formatter
 * @throws IllegalStateException if no valid parser is found (unlikely, unless the `String` parser is disabled)
 * @return a new column with parsed values
 */
public fun DataColumn<String?>.tryParse(options: ParserOptions? = null): DataColumn<*> {
    // All of the work happens in the internal implementation.
    return tryParseImpl(options)
}

/**
 * Parses this [DataFrame] by delegating to the selector-based `parse` overload,
 * selecting every column at any depth that is not a column group.
 *
 * @param options options for parsing; `null` is passed through to the overload unchanged
 * @return the [DataFrame] returned by the selector-based overload
 */
public fun <T> DataFrame<T>.parse(options: ParserOptions? = null): DataFrame<T> {
    return parse(options) {
        colsAtAnyDepth { column -> !column.isColumnGroup() }
    }
}

/**
 * Converts this column of strings into a column of another type, failing when no type can be guessed.
 *
 * The parsers in [Parsers] are tried one after another. A parser is accepted only when it parses
 * every value of the column successfully; as soon as it fails on a value, the next parser is tried.
 *
 * When the result of [tryParse] is still a `String` column, an [IllegalStateException] is thrown.
 * Use [tryParse] directly if you would rather get the unchanged column back.
 *
 * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped.
 *
 * @param options options for parsing, like providing a locale or a custom date-time formatter
 * @throws IllegalStateException if no valid parser is found
 * @return a new column with parsed values
 */
public fun DataColumn<String?>.parse(options: ParserOptions? = null): DataColumn<*> {
    val parsed = tryParse(options)
    // A column that is still typed as String means none of the real parsers matched.
    check(parsed.typeClass != String::class) { "Can't guess column type" }
    return parsed
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
package org.jetbrains.kotlinx.dataframe.impl

import kotlin.time.Duration
import kotlin.time.DurationUnit

/**
 * Checks if the string can be parsed as a duration without throwing an exception.
 *
 * The logic is taken from [Duration.parse] (Kotlin version 2.0.20). After an optional leading
 * `+`/`-` sign the string must be one of:
 * - an ISO-8601 style duration starting with `P` (for example `P1DT2H3M4.5S`),
 * - the word `Infinity` (case-insensitive),
 * - the default format (for example `1d 2h 3m 4.5s`), optionally wrapped in parentheses
 *   when a sign is present.
 *
 * Both the structure and the numeric part of each component are validated: a component is
 * rejected when the corresponding `toLong()`/`toDouble()` conversion in the original parser
 * would fail (for example `P1.5D`, `.5s`, `1.2.3s` or a default-format number that overflows `Long`).
 *
 * NOTE(review): the arithmetic of the original parser is not reproduced, so a string whose
 * components saturate to infinities of opposite signs is still reported as parsable.
 *
 * @param value the string to check
 * @return `true` if [value] has a shape and numeric components that the duration parser accepts,
 *   `false` otherwise
 */
internal fun Duration.Companion.canParse(value: String): Boolean {
    // Validates a whole-number ISO component the same way the original parser converts it:
    // a run of more than 16 digits saturates instead of failing, one leading '+' is dropped,
    // and the rest has to be a valid Long.
    fun isParsableIsoLong(text: String): Boolean {
        val digitsStart = if (text.isNotEmpty() && text[0] in "+-") 1 else 0
        val isOverLongDigitRun =
            text.length - digitsStart > 16 && (digitsStart..text.lastIndex).all { text[it] in '0'..'9' }
        if (isOverLongDigitRun) return true
        val withoutLeadingPlus = if (text.startsWith("+")) text.drop(1) else text
        return withoutLeadingPlus.toLongOrNull() != null
    }

    var length = value.length
    if (length == 0) return false
    var index = 0
    val infinityString = "Infinity"
    when (value[index]) {
        '+', '-' -> index++
    }
    val hasSign = index > 0
    when {
        // only a sign, no components
        length <= index -> return false

        value[index] == 'P' -> {
            // ISO-8601 format: at least one component has to follow the 'P'
            if (++index == length) return false
            val nonDigitSymbols = "+-."
            var isTimeComponent = false
            var prevUnit: DurationUnit? = null
            while (index < length) {
                if (value[index] == 'T') {
                    // 'T' may appear only once and must be followed by a component
                    if (isTimeComponent || ++index == length) return false
                    isTimeComponent = true
                    continue
                }
                val component = value.substringWhile(index) { it in '0'..'9' || it in nonDigitSymbols }
                if (component.isEmpty()) return false
                index += component.length
                val unitChar = value.getOrElse(index) { return false }
                index++
                val unit = durationUnitByIsoCharOrNull(unitChar, isTimeComponent) ?: return false
                // units must appear in strictly descending order (days, hours, minutes, seconds)
                if (prevUnit != null && prevUnit <= unit) return false
                prevUnit = unit
                // only the seconds component may carry a fractional part
                val dotIndex = component.indexOf('.')
                val hasValidNumber =
                    if (unit == DurationUnit.SECONDS && dotIndex > 0) {
                        isParsableIsoLong(component.substring(0, dotIndex)) &&
                            component.substring(dotIndex).toDoubleOrNull() != null
                    } else {
                        isParsableIsoLong(component)
                    }
                if (!hasValidNumber) return false
            }
        }

        // taking the longer of the two lengths makes this match only when the remainder
        // is exactly "Infinity" (ignoring case)
        value.regionMatches(
            thisOffset = index,
            other = infinityString,
            otherOffset = 0,
            length = maxOf(length - index, infinityString.length),
            ignoreCase = true,
        ) -> return true

        else -> {
            // parse default string format
            var prevUnit: DurationUnit? = null
            var afterFirst = false
            var allowSpaces = !hasSign
            if (hasSign && value[index] == '(' && value.last() == ')') {
                // signed durations with several components are wrapped in parentheses: -(1h 30m)
                allowSpaces = true
                if (++index == --length) return false
            }
            while (index < length) {
                if (afterFirst && allowSpaces) {
                    index = value.skipWhile(index) { it == ' ' }
                }
                afterFirst = true
                val component = value.substringWhile(index) { it in '0'..'9' || it == '.' }
                if (component.isEmpty()) return false
                index += component.length
                val unitName = value.substringWhile(index) { it in 'a'..'z' }
                index += unitName.length
                val unit = durationUnitByShortNameOrNull(unitName) ?: return false
                // units must appear in strictly descending order
                if (prevUnit != null && prevUnit <= unit) return false
                prevUnit = unit
                val dotIndex = component.indexOf('.')
                if (dotIndex > 0) {
                    // whole part and fractional part are converted separately by the original parser
                    if (component.substring(0, dotIndex).toLongOrNull() == null) return false
                    if (component.substring(dotIndex).toDoubleOrNull() == null) return false
                    // a fractional component must be the last one
                    if (index < length) return false
                } else if (component.toLongOrNull() == null) {
                    // covers a leading '.' as well as values that do not fit into a Long
                    return false
                }
            }
        }
    }
    return true
}

/**
 * Checks if the string can be parsed as a java duration without throwing an exception.
 *
 * Matching [isoDurationRegex] alone is not sufficient, because every section of the pattern is
 * optional. Like `java.time.Duration.parse`, this function additionally requires that
 * - the time part is not a lone `T` without any time section after it, and
 * - at least one of the day, hour, minute or second sections is present.
 *
 * NOTE(review): numeric overflow of the individual sections is not checked here, so a `true`
 * result for extremely large numbers may still fail in the actual parser.
 *
 * @param value the string to check
 * @return `true` if [value] matches the ISO-8601 duration pattern and contains at least one section
 */
internal fun javaDurationCanParse(value: String): Boolean {
    val match = isoDurationRegex.matchEntire(value) ?: return false
    val groups = match.groups
    // Group 3 is the whole time part; a bare "T" means no time sections follow it.
    // NOTE(review): compared case-sensitively to mirror the check in the JDK implementation.
    if (groups[3]?.value == "T") return false
    // Groups 2, 4, 5 and 6 are the day, hour, minute and second numbers.
    return groups[2] != null || groups[4] != null || groups[5] != null || groups[6] != null
}

/**
 * Regex from [java.time.Duration.Lazy.PATTERN]; it represents the ISO-8601 duration format.
 *
 * Capturing groups: 1 = sign, 2 = days, 3 = time part (starting with `T`), 4 = hours,
 * 5 = minutes, 6 = seconds, 7 = fraction of a second.
 */
private val isoDurationRegex = Regex(
    """([-+]?)P(?:([-+]?[0-9]+)D)?(T(?:([-+]?[0-9]+)H)?(?:([-+]?[0-9]+)M)?(?:([-+]?[0-9]+)(?:[.,]([0-9]{0,9}))?S)?)?""",
    RegexOption.IGNORE_CASE,
)

/**
 * Copy of [kotlin.time.substringWhile] (Kotlin version 2.0.20).
 *
 * Returns the part of this string that starts at [startIndex] and extends for as long as
 * consecutive characters satisfy [predicate].
 */
private inline fun String.substringWhile(startIndex: Int, predicate: (Char) -> Boolean): String {
    val endExclusive = skipWhile(startIndex, predicate)
    return substring(startIndex, endExclusive)
}

/**
 * Copy of [kotlin.time.skipWhile] (Kotlin version 2.0.20).
 *
 * Returns the index of the first character at or after [startIndex] that does not satisfy
 * [predicate], or the string length when every remaining character satisfies it.
 * When [startIndex] is already at or past the end of the string, [startIndex] itself is returned.
 */
private inline fun String.skipWhile(startIndex: Int, predicate: (Char) -> Boolean): Int {
    for (position in startIndex until length) {
        if (!predicate(this[position])) return position
    }
    // Nothing stopped the scan: either the end was reached or the scan never started.
    return maxOf(startIndex, length)
}

/**
 * Copy of [kotlin.time.durationUnitByIsoChar] (Kotlin version 2.0.20),
 * returning `null` for an unknown character.
 *
 * In the date part (before `T`) only `D` is recognised; in the time part only `H`, `M` and `S` are.
 *
 * @param isoChar the unit designator character
 * @param isTimeComponent whether the designator belongs to the time part
 * @return the matching [DurationUnit], or `null` when [isoChar] is not valid in that part
 */
private fun durationUnitByIsoCharOrNull(isoChar: Char, isTimeComponent: Boolean): DurationUnit? {
    if (!isTimeComponent) {
        return if (isoChar == 'D') DurationUnit.DAYS else null
    }
    return when (isoChar) {
        'H' -> DurationUnit.HOURS
        'M' -> DurationUnit.MINUTES
        'S' -> DurationUnit.SECONDS
        else -> null
    }
}

/**
 * Copy of [kotlin.time.durationUnitByShortName] (Kotlin version 2.0.20),
 * returning `null` for an unknown name.
 *
 * @param shortName the unit suffix, one of `d`, `h`, `m`, `s`, `ms`, `us`, `ns`
 * @return the matching [DurationUnit], or `null` when [shortName] is none of the above
 */
private fun durationUnitByShortNameOrNull(shortName: String): DurationUnit? {
    // Listed from the largest unit to the smallest.
    return when (shortName) {
        "d" -> DurationUnit.DAYS
        "h" -> DurationUnit.HOURS
        "m" -> DurationUnit.MINUTES
        "s" -> DurationUnit.SECONDS
        "ms" -> DurationUnit.MILLISECONDS
        "us" -> DurationUnit.MICROSECONDS
        "ns" -> DurationUnit.NANOSECONDS
        else -> null
    }
}
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
package org.jetbrains.kotlinx.dataframe.impl.api

import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.coroutineScope
import kotlinx.coroutines.runBlocking
import kotlinx.datetime.Instant
import kotlinx.datetime.LocalDate
import kotlinx.datetime.LocalDateTime
Expand Down Expand Up @@ -325,7 +321,7 @@ internal object Parsers : GlobalParserOptions {
parser
}

private val parsersOrder = listOf(
internal val parsersOrder = listOf(
// Int
stringParser<Int> { it.toIntOrNull() },
// Long
Expand Down Expand Up @@ -415,7 +411,7 @@ internal object Parsers : GlobalParserOptions {
stringParser<String> { it },
)

internal val parsersMap = parsersOrder.associateBy { it.type }
private val parsersMap = parsersOrder.associateBy { it.type }

val size: Int = parsersOrder.size

Expand Down Expand Up @@ -478,16 +474,16 @@ internal object Parsers : GlobalParserOptions {
internal fun DataColumn<String?>.tryParseImpl(options: ParserOptions?): DataColumn<*> {
val columnSize = size
val parsedValues = ArrayList<Any?>(columnSize)
var hasNulls: Boolean = false
var hasNotNulls: Boolean = false
var nullStringParsed: Boolean = false
var hasNulls = false
var hasNotNulls = false
var nullStringParsed = false
val nulls = options?.nullStrings ?: Parsers.nulls

val parsersToCheck = Parsers.parsersMap
val parserTypesToCheck = parsersToCheck.keys
val parsersToCheck = Parsers.parsersOrder
val parserTypesToCheck = parsersToCheck.map { it.type }.toSet()

var correctParser: StringParser<*>? = null
for ((_, parser) in parsersToCheck) {
for (parser in parsersToCheck) {
if (parser.coveredBy.any { it in parserTypesToCheck }) continue

val parserWithOptions = parser.applyOptions(options)
Expand All @@ -496,24 +492,21 @@ internal fun DataColumn<String?>.tryParseImpl(options: ParserOptions?): DataColu
hasNotNulls = false
nullStringParsed = false
for (str in this) {
when {
str == null -> {
when (str) {
null -> {
parsedValues += null
hasNulls = true
}

str in nulls -> {
in nulls -> {
parsedValues += null
hasNulls = true
nullStringParsed = true
}

else -> {
val trimmed = str.trim()
val res = parserWithOptions(trimmed)
if (res == null) {
continue
}
val res = parserWithOptions(trimmed) ?: continue
parsedValues += res
hasNotNulls = true
}
Expand Down Expand Up @@ -545,44 +538,32 @@ internal fun <T> DataColumn<String?>.parse(parser: StringParser<T>, options: Par
return DataColumn.createValueColumn(name(), parsedValues, parser.type.withNullability(hasNulls)) as DataColumn<T?>
}

internal fun <T> DataFrame<T>.parseImpl(options: ParserOptions?, columns: ColumnsSelector<T, Any?>): DataFrame<T> =
runBlocking { parseParallel(options, columns) }

private suspend fun <T> DataFrame<T>.parseParallel(
options: ParserOptions?,
columns: ColumnsSelector<T, Any?>,
): DataFrame<T> =
coroutineScope {
val convertedCols = getColumnsWithPaths(columns).map { col ->
async {
when {
// when a frame column is requested to be parsed,
// parse each value/frame column at any depth inside each DataFrame in the frame column
col.isFrameColumn() ->
col.values.map {
async {
it.parseParallel(options) {
colsAtAnyDepth { !it.isColumnGroup() }
}
}
}.awaitAll()
.toColumn(col.name)

// when a column group is requested to be parsed,
// parse each column in the group
col.isColumnGroup() ->
col.parseParallel(options) { all() }
.asColumnGroup(col.name())
.asDataColumn()

// Base case, parse the column if it's a `String?` column
col.isSubtypeOf<String?>() ->
col.cast<String?>().tryParse(options)

else -> col
}.let { ColumnToInsert(col.path, it) }
}
}.awaitAll()
internal fun <T> DataFrame<T>.parseImpl(options: ParserOptions?, columns: ColumnsSelector<T, Any?>): DataFrame<T> {
val convertedCols = getColumnsWithPaths(columns).map { col ->
when {
// when a frame column is requested to be parsed,
// parse each value/frame column at any depth inside each DataFrame in the frame column
col.isFrameColumn() ->
col.values.map {
it.parseImpl(options) {
colsAtAnyDepth { !it.isColumnGroup() }
}
}.toColumn(col.name)

// when a column group is requested to be parsed,
// parse each column in the group
col.isColumnGroup() ->
col.parseImpl(options) { all() }
.asColumnGroup(col.name())
.asDataColumn()

emptyDataFrame<T>().insertImpl(convertedCols)
// Base case, parse the column if it's a `String?` column
col.isSubtypeOf<String?>() ->
col.cast<String?>().tryParse(options)

else -> col
}.let { ColumnToInsert(col.path, it) }
}

return emptyDataFrame<T>().insertImpl(convertedCols)
}
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
<p class="dataframe_description"></p>
</details>
<details>
<summary>Output DataFrame: rowsCount = 7, columnsCount = 5</summary>
<summary>Output DataFrame: rowsCount = 7, columnsCount = 2</summary>
<table class="dataframe" id="df_1"></table>

<p class="dataframe_description"></p>
Expand Down Expand Up @@ -478,13 +478,8 @@
call_DataFrame(function() { DataFrame.renderTable(0) });

/*<!--*/
call_DataFrame(function() { DataFrame.addTable({ cols: [{ name: "<span title=\"firstName: String\">firstName</span>", children: [], rightAlign: false, values: ["Alice","Bob","Charlie","Charlie","Bob","Alice","Charlie"] },
{ name: "<span title=\"lastName: String\">lastName</span>", children: [], rightAlign: false, values: ["Cooper","Dylan","Daniels","Chaplin","Marley","Wolf","Byrd"] },
{ name: "<span title=\"name: DataRow<*>\">name</span>", children: [0, 1], rightAlign: false, values: ["<span class=\"formatted\" title=\"firstName: Alice\nlastName: Cooper\"><span class=\"structural\">{ </span><span class=\"structural\">firstName: </span>Alice<span class=\"structural\">, </span><span class=\"structural\">lastName: </span>Cooper<span class=\"structural\"> }</span></span>","<span class=\"formatted\" title=\"firstName: Bob\nlastName: Dylan\"><span class=\"structural\">{ </span><span class=\"structural\">firstName: </span>Bob<span class=\"structural\">, </span><span class=\"structural\">lastName: </span>Dylan<span class=\"structural\"> }</span></span>","<span class=\"formatted\" title=\"firstName: Charlie\nlastName: Daniels\"><span class=\"structural\">{ </span><span class=\"structural\">firstName: </span>Charlie<span class=\"structural\">, </span><span class=\"structural\">lastName: </span>Dan<span class=\"structural\">...</span><span class=\"structural\"> }</span></span>","<span class=\"formatted\" title=\"firstName: Charlie\nlastName: Chaplin\"><span class=\"structural\">{ </span><span class=\"structural\">firstName: </span>Charlie<span class=\"structural\">, </span><span class=\"structural\">lastName: </span>Cha<span class=\"structural\">...</span><span class=\"structural\"> }</span></span>","<span class=\"formatted\" title=\"firstName: Bob\nlastName: Marley\"><span class=\"structural\">{ </span><span class=\"structural\">firstName: </span>Bob<span class=\"structural\">, </span><span class=\"structural\">lastName: </span>Marley<span class=\"structural\"> }</span></span>","<span class=\"formatted\" title=\"firstName: Alice\nlastName: Wolf\"><span class=\"structural\">{ </span><span class=\"structural\">firstName: </span>Alice<span class=\"structural\">, </span><span class=\"structural\">lastName: </span>Wolf<span class=\"structural\"> }</span></span>","<span class=\"formatted\" title=\"firstName: Charlie\nlastName: Byrd\"><span 
class=\"structural\">{ </span><span class=\"structural\">firstName: </span>Charlie<span class=\"structural\">, </span><span class=\"structural\">lastName: </span>Byrd<span class=\"structural\"> }</span></span>"] },
{ name: "<span title=\"age: Int\">age</span>", children: [], rightAlign: true, values: ["<span class=\"formatted\" title=\"\"><span class=\"numbers\">15</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">45</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">20</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">40</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">30</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">20</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">30</span></span>"] },
{ name: "<span title=\"city: String?\">city</span>", children: [], rightAlign: false, values: ["London","Dubai","Moscow","Milan","Tokyo","<span class=\"formatted\" title=\"\"><span class=\"null\">null</span></span>","Moscow"] },
call_DataFrame(function() { DataFrame.addTable({ cols: [{ name: "<span title=\"age: Int\">age</span>", children: [], rightAlign: true, values: ["<span class=\"formatted\" title=\"\"><span class=\"numbers\">15</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">45</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">20</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">40</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">30</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">20</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">30</span></span>"] },
{ name: "<span title=\"weight: Int?\">weight</span>", children: [], rightAlign: true, values: ["<span class=\"formatted\" title=\"\"><span class=\"numbers\">54</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">87</span></span>","<span class=\"formatted\" title=\"\"><span class=\"null\">null</span></span>","<span class=\"formatted\" title=\"\"><span class=\"null\">null</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">68</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">55</span></span>","<span class=\"formatted\" title=\"\"><span class=\"numbers\">90</span></span>"] },
{ name: "<span title=\"isHappy: Boolean\">isHappy</span>", children: [], rightAlign: false, values: ["true","true","false","true","true","false","true"] },
], id: 1, rootId: 1, totalRows: 7 } ) });
/*-->*/

Expand Down

0 comments on commit 117e958

Please sign in to comment.