Skip to content

Commit

Permalink
refactored suggestedType+guessTypeWithSuggestedAsUpperbound into Type…
Browse files Browse the repository at this point in the history
…Suggestion, working on feedback
  • Loading branch information
Jolanrensen committed Oct 25, 2024
1 parent 42e2d67 commit dfaf46e
Show file tree
Hide file tree
Showing 11 changed files with 119 additions and 73 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import org.jetbrains.kotlinx.dataframe.columns.ColumnPath
import org.jetbrains.kotlinx.dataframe.columns.ColumnResolutionContext
import org.jetbrains.kotlinx.dataframe.columns.ColumnWithPath
import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion
import org.jetbrains.kotlinx.dataframe.columns.ValueColumn
import org.jetbrains.kotlinx.dataframe.impl.columns.ColumnGroupImpl
import org.jetbrains.kotlinx.dataframe.impl.columns.FrameColumnImpl
Expand Down Expand Up @@ -73,11 +74,10 @@ public interface DataColumn<out T> : BaseColumn<T> {
/**
* Creates [ValueColumn] using given [name], [values] and reified column [type].
*
* Be careful; values are NOT checked to adhere to [type] for efficiency,
* The column [type] will be defined at compile-time using [T] argument.
* Be careful with casting; values are NOT checked to adhere to `reified` type [T] for efficiency,
* unless you specify [infer].
*
* Note, that column [type] will be defined at compile-time using [T] argument
*
* @param T type of the column
* @param name name of the column
* @param values list of column values
Expand Down Expand Up @@ -114,7 +114,8 @@ public interface DataColumn<out T> : BaseColumn<T> {
/**
* Creates [FrameColumn] using the given [name] and list of dataframes [groups].
*
* Be careful; [groups] must be a non-null list of [DataFrames][DataFrame].
* [groups] must be a non-null list of [DataFrames][DataFrame], as [FrameColumn] does
* not allow `null` values.
* This is NOT checked at runtime for efficiency, nor is the validity of given [schema].
*
* @param name name of the frame column
Expand All @@ -141,21 +142,21 @@ public interface DataColumn<out T> : BaseColumn<T> {
*
* @param name name of the column
* @param values the values to represent each row in the column
* @param suggestedType optional suggested type for values. Default is [TypeSuggestion.Infer].
* See [TypeSuggestion] for more information.
* @param nullable optionally you can specify whether [values] contains nulls, if `null` it is inferred.
* @param allColsMakesColGroup if `true`, then, if all values are non-null same-sized columns,
* a column group will be created instead of a [DataColumn][DataColumn]`<`[AnyCol][AnyCol]`>`.
*/
public fun <T> createWithTypeInference(
    name: String,
    values: List<T>,
    suggestedType: TypeSuggestion = TypeSuggestion.Infer,
    nullable: Boolean? = null,
    allColsMakesColGroup: Boolean = false,
): DataColumn<T> {
    // Pure delegation: every argument is forwarded unchanged (by name) to createColumnGuessingType.
    return createColumnGuessingType(
        name = name,
        values = values,
        suggestedType = suggestedType,
        nullable = nullable,
        allColsMakesColGroup = allColsMakesColGroup,
    )
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import org.jetbrains.kotlinx.dataframe.columns.ColumnAccessor
import org.jetbrains.kotlinx.dataframe.columns.ColumnPath
import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion
import org.jetbrains.kotlinx.dataframe.exceptions.DuplicateColumnNamesException
import org.jetbrains.kotlinx.dataframe.exceptions.UnequalColumnSizesException
import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator
Expand Down Expand Up @@ -225,8 +226,7 @@ public class ColumnDelegate<T>(private val parent: ColumnGroupReference? = null)
public inline fun <reified T> columnOf(vararg values: T): DataColumn<T> =
createColumnGuessingType(
values = values.asIterable(),
suggestedType = typeOf<T>(),
guessTypeWithSuggestedAsUpperbound = true,
suggestedType = TypeSuggestion.InferWithUpperbound(typeOf<T>()),
listifyValues = false,
allColsMakesColGroup = true,
).forceResolve()
Expand All @@ -252,8 +252,7 @@ public fun <T> columnOf(frames: Iterable<DataFrame<T>>): FrameColumn<T> =
public inline fun <reified T> column(values: Iterable<T>): DataColumn<T> =
createColumnGuessingType(
values = values,
suggestedType = typeOf<T>(),
guessTypeWithSuggestedAsUpperbound = false,
suggestedType = TypeSuggestion.Use(typeOf<T>()),
allColsMakesColGroup = true,
).forceResolve()

Expand Down Expand Up @@ -305,8 +304,7 @@ public inline fun <T, reified C> dataFrameOf(header: Iterable<T>, fill: (T) -> I
createColumnGuessingType(
name = value.toString(),
values = fill(value).asList(),
suggestedType = typeOf<C>(),
guessTypeWithSuggestedAsUpperbound = true,
suggestedType = TypeSuggestion.InferWithUpperbound(typeOf<C>()),
)
}.toDataFrame()

Expand Down Expand Up @@ -346,8 +344,7 @@ public class DataFrameBuilder(private val header: List<String>) {
createColumnGuessingType(
name = name,
values = valuesBuilder(name).asList(),
suggestedType = typeOf<T>(),
guessTypeWithSuggestedAsUpperbound = true,
suggestedType = TypeSuggestion.InferWithUpperbound(typeOf<T>()),
)
}

Expand Down Expand Up @@ -387,12 +384,12 @@ public class DataFrameBuilder(private val header: List<String>) {
)
}

private inline fun <reified C> fillNotNull(nrow: Int, crossinline init: (Int) -> C & Any) =
private inline fun <reified C : Any> fillNotNull(nrow: Int, crossinline init: (Int) -> C) =
withColumns { name ->
DataColumn.createValueColumn(
name = name,
values = List(nrow, init),
type = typeOf<C>().withNullability(false),
type = typeOf<C>(),
)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,18 @@ import org.jetbrains.kotlinx.dataframe.ColumnsSelector
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType
import org.jetbrains.kotlinx.dataframe.type
import kotlin.reflect.KProperty

public fun AnyCol.inferType(): DataColumn<*> = createColumnGuessingType(name, toList(), type, true)
/**
 * Builds a new column from this column's [name] and values via [createColumnGuessingType],
 * passing the current [type] as an upper bound ([TypeSuggestion.InferWithUpperbound]).
 *
 * @return the column produced by [createColumnGuessingType].
 */
public fun AnyCol.inferType(): DataColumn<*> {
    // The existing column type is only handed over as the upper bound of the suggestion.
    val upperboundSuggestion = TypeSuggestion.InferWithUpperbound(type)
    return createColumnGuessingType(
        name = name,
        values = toList(),
        suggestedType = upperboundSuggestion,
    )
}

// region DataFrame

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -233,14 +233,14 @@ public inline fun <reified T> Iterable<T>.toValueColumn(column: KProperty<T>): V
public enum class Infer {

/**
* Use reified type argument of an inline [DataFrame] operation as [DataColumn.type].
* Use `reified` type argument of an inline [DataFrame] operation as [DataColumn.type].
*
* This is the most efficient but least safe option.
*/
None,

/**
* Use reified type argument of an inline [DataFrame] operation as [DataColumn.type],
* Use `reified` type argument of an inline [DataFrame] operation as [DataColumn.type],
* but compute [DataColumn.hasNulls] by checking column [DataColumn.values] for an actual presence of `null` values.
*/
Nulls,
Expand All @@ -250,6 +250,10 @@ public enum class Infer {
* base type as an upper bound.
*
* This is the least efficient but safest option.
*
* It's useful, for instance,
* if you have a column of type `Any?` and want its schema type to be inferred based on the actual values.
* In many cases, letting the library infer by `reified` types is enough and more efficient.
*/
Type,

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package org.jetbrains.kotlinx.dataframe.columns

import kotlin.reflect.KType

/**
 * Describes how the type of a column should be determined.
 *
 * A suggestion is one of:
 *
 * - [Infer] - {@include [Infer]}
 * - [InferWithUpperbound] - {@include [InferWithUpperbound]}
 * - [Use] - {@include [Use]}
 *
 * Put differently: the caller supplies either no type at all ([Infer]),
 * an [upper bound][InferWithUpperbound] of possible types from which the library infers the exact type,
 * or the [exact type][Use] to use.
 */
public sealed interface TypeSuggestion {

    /** The library will try to infer the type by checking all the values. */
    public data object Infer : TypeSuggestion

    /** The library will infer the type by checking all the values taking a given upper bound into account. */
    public data class InferWithUpperbound(val upperbound: KType) : TypeSuggestion

    /** The library will use the specified type without inference. */
    public data class Use(val type: KType) : TypeSuggestion

    public companion object {

        /**
         * Builds the [TypeSuggestion] matching the given pair of arguments:
         *
         * - type given, guessing allowed -> [InferWithUpperbound]
         * - type given, guessing not allowed -> [Use]
         * - no type, guessing allowed -> [Infer]
         *
         * @param suggestedType the suggested type, or `null` if there is none
         * @param guessType whether the type may be guessed
         * @throws IllegalStateException if [suggestedType] is `null` and [guessType] is `false`
         */
        public fun create(suggestedType: KType?, guessType: Boolean): TypeSuggestion {
            // Without a suggested type, guessing is the only remaining option.
            if (suggestedType == null) {
                return if (guessType) {
                    Infer
                } else {
                    error("Cannot create TypeSuggestion with no suggested type and no guessing allowed.")
                }
            }
            return if (guessType) InferWithUpperbound(suggestedType) else Use(suggestedType)
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import org.jetbrains.kotlinx.dataframe.aggregation.NamedValue
import org.jetbrains.kotlinx.dataframe.api.filter
import org.jetbrains.kotlinx.dataframe.api.isComparable
import org.jetbrains.kotlinx.dataframe.api.isNumber
import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion
import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType

internal inline fun <T> Aggregatable<T>.remainingColumns(
Expand All @@ -22,7 +23,6 @@ internal fun NamedValue.toColumnWithPath() =
path to createColumnGuessingType(
name = path.last(),
values = listOf(value),
suggestedType = type,
guessTypeWithSuggestedAsUpperbound = guessType,
suggestedType = TypeSuggestion.create(type, guessType),
defaultValue = default,
)
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.api.emptyDataFrame
import org.jetbrains.kotlinx.dataframe.api.isColumnGroup
import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion
import org.jetbrains.kotlinx.dataframe.hasNulls
import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType
import org.jetbrains.kotlinx.dataframe.impl.commonType
Expand Down Expand Up @@ -76,8 +77,7 @@ internal fun <T> concatImpl(name: String, columns: List<DataColumn<T>?>, columnS
return createColumnGuessingType(
name = name,
values = list,
suggestedType = tartypeOf,
guessTypeWithSuggestedAsUpperbound = guessType,
suggestedType = TypeSuggestion.create(tartypeOf, guessType),
defaultValue = defaultValue,
).cast()
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ import org.jetbrains.kotlinx.dataframe.api.isFrameColumn
import org.jetbrains.kotlinx.dataframe.api.isSubtypeOf
import org.jetbrains.kotlinx.dataframe.api.toColumn
import org.jetbrains.kotlinx.dataframe.api.tryParse
import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion
import org.jetbrains.kotlinx.dataframe.columns.size
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException
import org.jetbrains.kotlinx.dataframe.hasNulls
import org.jetbrains.kotlinx.dataframe.impl.canParse
import org.jetbrains.kotlinx.dataframe.impl.catchSilent
import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType
import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType
import org.jetbrains.kotlinx.dataframe.impl.javaDurationCanParse
import org.jetbrains.kotlinx.dataframe.io.isURL
Expand Down Expand Up @@ -529,7 +529,11 @@ internal fun DataColumn<String?>.tryParseImpl(options: ParserOptions?): DataColu

// Create a new column with the parsed values,
// createColumnGuessingType is used to handle unifying values if needed
return createColumnGuessingType(name(), parsedValues, type)
return DataColumn.createWithTypeInference(
name = name(),
values = parsedValues,
suggestedType = TypeSuggestion.Use(type),
)
}

internal fun <T> DataColumn<String?>.parse(parser: StringParser<T>, options: ParserOptions?): DataColumn<T?> {
Expand All @@ -539,7 +543,11 @@ internal fun <T> DataColumn<String?>.parse(parser: StringParser<T>, options: Par
handler(it.trim()) ?: throw IllegalStateException("Couldn't parse '$it' into type ${parser.type}")
}
}
return createColumnGuessingType(name(), parsedValues, parser.type.withNullability(hasNulls))
return DataColumn.createWithTypeInference(
name = name(),
values = parsedValues,
suggestedType = TypeSuggestion.Use(parser.type.withNullability(hasNulls)),
)
}

internal fun <T> DataFrame<T>.parseImpl(options: ParserOptions?, columns: ColumnsSelector<T, Any?>): DataFrame<T> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ internal fun convertToDataFrame(
val shouldCreateColumnGroup = kClass == DataRow::class

when {
hasExceptions -> DataColumn.createWithTypeInference(it.columnName, values, nullable)
hasExceptions -> DataColumn.createWithTypeInference(it.columnName, values, nullable = nullable)

shouldCreateValueCol ->
DataColumn.createValueColumn(
Expand Down
Loading

0 comments on commit dfaf46e

Please sign in to comment.