Skip to content

Commit

Permalink
some implementation refactoring of csv
Browse files Browse the repository at this point in the history
  • Loading branch information
Jolanrensen committed Oct 22, 2024
1 parent c811d70 commit ae8ce9c
Show file tree
Hide file tree
Showing 466 changed files with 93,899 additions and 84 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package org.jetbrains.kotlinx.dataframe

import org.jetbrains.kotlinx.dataframe.api.ColumnSelectionDsl
import org.jetbrains.kotlinx.dataframe.api.asColumnGroup
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.castFrameColumn
import org.jetbrains.kotlinx.dataframe.api.getColumn
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
import org.jetbrains.kotlinx.dataframe.columns.ColumnPath
import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
import org.jetbrains.kotlinx.dataframe.impl.columnName
import org.jetbrains.kotlinx.dataframe.impl.columns.asAnyFrameColumn
import kotlin.reflect.KProperty

/**
 * Gives access to a set of [columns][DataColumn].
 *
 * Common base interface of [DataFrame] and [ColumnSelectionDsl].
 *
 * @param T Schema marker. Used to generate extension properties for typed column access.
 */
public interface ColumnsContainer<out T> {

    // region columns

    /** Returns the columns of this container as a list. */
    public fun columns(): List<AnyCol>

    /** Returns the number of columns. */
    public fun columnsCount(): Int

    /** Tells whether a column with the given [name] is present. */
    public fun containsColumn(name: String): Boolean

    /** Tells whether a column is present at the given [path]. */
    public fun containsColumn(path: ColumnPath): Boolean

    /** Returns the index of the column with the given [name]. */
    public fun getColumnIndex(name: String): Int

    // endregion

    // region getColumnOrNull
    // Nullable lookups, one overload per way of identifying a column
    // (name, index, reference, property, path, selector).

    public fun getColumnOrNull(name: String): AnyCol?

    public fun getColumnOrNull(index: Int): AnyCol?

    public fun <R> getColumnOrNull(column: ColumnReference<R>): DataColumn<R>?

    public fun <R> getColumnOrNull(column: KProperty<R>): DataColumn<R>?

    public fun getColumnOrNull(path: ColumnPath): AnyCol?

    public fun <R> getColumnOrNull(column: ColumnSelector<T, R>): DataColumn<R>?

    // endregion

    // region get

    /** Returns the column called [columnName]; forwards to `getColumn`. */
    public operator fun get(columnName: String): AnyCol {
        return getColumn(columnName)
    }

    /** Returns the column located at [columnPath]; forwards to `getColumn`. */
    public operator fun get(columnPath: ColumnPath): AnyCol {
        return getColumn(columnPath)
    }

    /** Looks a column up by the name of [column] and casts the result to `DataColumn<R>`. */
    public operator fun <R> get(column: DataColumn<R>): DataColumn<R> {
        val foundByName = getColumn(column.name())
        return foundByName.cast()
    }

    /** Overload for row-typed columns: the result is typed as a [ColumnGroup]. */
    public operator fun <R> get(column: DataColumn<DataRow<R>>): ColumnGroup<R> {
        return getColumn(column)
    }

    /** Overload for frame-typed columns: the result is typed as a [FrameColumn]. */
    public operator fun <R> get(column: DataColumn<DataFrame<R>>): FrameColumn<R> {
        return getColumn(column)
    }

    /** Returns the column identified by the reference [column]; forwards to `getColumn`. */
    public operator fun <R> get(column: ColumnReference<R>): DataColumn<R> {
        return getColumn(column)
    }

    /** Overload for row-typed references: the result is typed as a [ColumnGroup]. */
    public operator fun <R> get(column: ColumnReference<DataRow<R>>): ColumnGroup<R> {
        return getColumn(column)
    }

    /** Overload for frame-typed references: the result is typed as a [FrameColumn]. */
    public operator fun <R> get(column: ColumnReference<DataFrame<R>>): FrameColumn<R> {
        return getColumn(column)
    }

    /** Looks a column up by the column name derived from the property [column] and casts it to `DataColumn<R>`. */
    public operator fun <R> get(column: KProperty<R>): DataColumn<R> {
        val foundByName = get(column.columnName)
        return foundByName.cast()
    }

    /** Same lookup as the plain [KProperty] overload, with the result converted to a [ColumnGroup]. */
    public operator fun <R> get(column: KProperty<DataRow<R>>): ColumnGroup<R> {
        return get(column.columnName).asColumnGroup().cast()
    }

    /** Same lookup as the plain [KProperty] overload, with the result converted to a [FrameColumn]. */
    public operator fun <R> get(column: KProperty<DataFrame<R>>): FrameColumn<R> {
        return get(column.columnName).asAnyFrameColumn().castFrameColumn()
    }

    /** Returns the list of columns chosen by the selector [columns]. */
    public fun <C> get(columns: ColumnsSelector<T, C>): List<DataColumn<C>>

    /**
     * Single-column variant: [column] is passed to the multi-column overload as a [ColumnsSelector]
     * and `single()` is applied to the resulting list.
     */
    public fun <C> get(column: ColumnSelector<T, C>): DataColumn<C> {
        val selected = get(column as ColumnsSelector<T, C>)
        return selected.single()
    }

    // endregion
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
package org.jetbrains.kotlinx.dataframe

import org.jetbrains.kotlinx.dataframe.api.Infer
import org.jetbrains.kotlinx.dataframe.api.asDataColumn
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.concat
import org.jetbrains.kotlinx.dataframe.api.filter
import org.jetbrains.kotlinx.dataframe.api.map
import org.jetbrains.kotlinx.dataframe.api.schema
import org.jetbrains.kotlinx.dataframe.api.take
import org.jetbrains.kotlinx.dataframe.columns.BaseColumn
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
import org.jetbrains.kotlinx.dataframe.columns.ColumnKind
import org.jetbrains.kotlinx.dataframe.columns.ColumnPath
import org.jetbrains.kotlinx.dataframe.columns.ColumnResolutionContext
import org.jetbrains.kotlinx.dataframe.columns.ColumnWithPath
import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
import org.jetbrains.kotlinx.dataframe.columns.ValueColumn
import org.jetbrains.kotlinx.dataframe.impl.columns.ColumnGroupImpl
import org.jetbrains.kotlinx.dataframe.impl.columns.FrameColumnImpl
import org.jetbrains.kotlinx.dataframe.impl.columns.ValueColumnImpl
import org.jetbrains.kotlinx.dataframe.impl.columns.addPath
import org.jetbrains.kotlinx.dataframe.impl.columns.guessColumnType
import org.jetbrains.kotlinx.dataframe.impl.columns.toColumnKind
import org.jetbrains.kotlinx.dataframe.impl.getValuesType
import org.jetbrains.kotlinx.dataframe.impl.splitByIndices
import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
import kotlin.reflect.KClass
import kotlin.reflect.KProperty
import kotlin.reflect.KType
import kotlin.reflect.typeOf

/**
 * A column that has a [name], [values] and a specific [type].
 *
 * It is the base interface for [ValueColumn] and [FrameColumn], but not for [ColumnGroup]. Nevertheless, the
 * implementations of all three [column kinds][ColumnKind] derive from DataColumn and can be cast to it safely.
 * Column operations whose signature clashes with the [DataFrame] API ([filter], [take], [map] etc.) are defined
 * for [DataColumn] and not for [BaseColumn].
 *
 * @param T type of values in the column.
 */
public interface DataColumn<out T> : BaseColumn<T> {

    public companion object {

        /**
         * Builds a [ValueColumn] from the given [name], [values] and [type].
         *
         * The type stored in the column is the result of `getValuesType(values, type, infer)`.
         *
         * @param name name of the column
         * @param values list of column values
         * @param type type of the column
         * @param infer column type inference mode
         * @param defaultValue handed over unchanged to the column implementation; `null` by default
         */
        public fun <T> createValueColumn(
            name: String,
            values: List<T>,
            type: KType,
            infer: Infer = Infer.None,
            defaultValue: T? = null,
        ): ValueColumn<T> {
            val resolvedType = getValuesType(values, type, infer)
            return ValueColumnImpl(values, name, resolvedType, defaultValue)
        }

        /**
         * Builds a [ValueColumn] from the given [name] and [values], taking the column type from the reified [T].
         *
         * Note that the column type is therefore determined at compile time from the [T] argument.
         * The type is resolved here with [infer] and then passed to the [KType]-based overload.
         *
         * @param T type of the column
         * @param name name of the column
         * @param values list of column values
         * @param infer column type inference mode
         */
        public inline fun <reified T> createValueColumn(
            name: String,
            values: List<T>,
            infer: Infer = Infer.None,
        ): ValueColumn<T> {
            val resolvedType = getValuesType(values, typeOf<T>(), infer)
            return createValueColumn(name, values, resolvedType)
        }

        /** Builds a [ColumnGroup] called [name] around the data frame [df]. */
        public fun <T> createColumnGroup(name: String, df: DataFrame<T>): ColumnGroup<T> {
            return ColumnGroupImpl(name, df)
        }

        /**
         * Builds a [FrameColumn] called [name] by splitting [df] at [startIndices].
         *
         * The schema of the column is taken lazily from [df].
         */
        public fun <T> createFrameColumn(
            name: String,
            df: DataFrame<T>,
            startIndices: Iterable<Int>,
        ): FrameColumn<T> {
            val groups = df.splitByIndices(startIndices.asSequence()).toList()
            return FrameColumnImpl(name, groups, lazy { df.schema() })
        }

        /** Builds a [FrameColumn] called [name] from ready-made [groups] and an optional lazy [schema]. */
        public fun <T> createFrameColumn(
            name: String,
            groups: List<DataFrame<T>>,
            schema: Lazy<DataFrameSchema>? = null,
        ): FrameColumn<T> {
            return FrameColumnImpl(name, groups, schema)
        }

        /** Builds a column from [values], leaving the choice of column type to `guessColumnType`. */
        public fun <T> createWithTypeInference(
            name: String,
            values: List<T>,
            nullable: Boolean? = null,
        ): DataColumn<T> {
            return guessColumnType(name, values, nullable = nullable)
        }

        /**
         * Builds a column of the [kind][ColumnKind] that corresponds to [type].
         *
         * - value kind: a value column is created from [values], [type] and [infer];
         * - group kind: [values] are treated as rows and concatenated into one frame that becomes a column group;
         * - frame kind: [values] are treated as data frames and become the groups of a frame column.
         */
        public fun <T> create(
            name: String,
            values: List<T>,
            type: KType,
            infer: Infer = Infer.None,
        ): DataColumn<T> {
            // Intentionally no `else` branch: the `when` stays exhaustive over ColumnKind.
            return when (type.toColumnKind()) {
                ColumnKind.Value ->
                    createValueColumn(name, values, type, infer)

                ColumnKind.Group ->
                    createColumnGroup(name, (values as List<AnyRow?>).concat()).asDataColumn().cast()

                ColumnKind.Frame ->
                    createFrameColumn(name, values as List<AnyFrame>).asDataColumn().cast()
            }
        }

        /** Reified shortcut for [create] that takes the column type from [T]. */
        public inline fun <reified T> create(name: String, values: List<T>, infer: Infer = Infer.None): DataColumn<T> {
            return create(name, values, typeOf<T>(), infer)
        }

        /** Builds a value column of type [Unit] that holds no values; [name] defaults to the empty string. */
        public fun empty(name: String = ""): AnyCol {
            return createValueColumn(name, emptyList<Unit>(), typeOf<Unit>())
        }
    }

    /** Reports whether the column [type] is marked nullable. */
    public fun hasNulls(): Boolean {
        return type().isMarkedNullable
    }

    override fun distinct(): DataColumn<T>

    override fun get(indices: Iterable<Int>): DataColumn<T>

    override fun rename(newName: String): DataColumn<T>

    /** Resolves to this column itself, with its path attached via `addPath()`. */
    override fun resolveSingle(context: ColumnResolutionContext): ColumnWithPath<T>? {
        return this.addPath()
    }

    /** Property-delegate accessor: narrows the result of the inherited implementation to [DataColumn]. */
    override operator fun getValue(thisRef: Any?, property: KProperty<*>): DataColumn<T> {
        val inherited = super.getValue(thisRef, property)
        return inherited as DataColumn<T>
    }

    /** Iterates over the [values] of the column. */
    public operator fun iterator(): Iterator<T> {
        return values().iterator()
    }

    public override operator fun get(range: IntRange): DataColumn<T>
}

/** Property-style shorthand for `name()`. */
public val AnyCol.name: String
    get() {
        return name()
    }

/** Property-style shorthand for `path()`. */
public val AnyCol.path: ColumnPath
    get() {
        return path()
    }

/** Property-style shorthand for `values()`. */
public val <T> DataColumn<T>.values: Iterable<T>
    get() {
        return values()
    }

/** Property-style shorthand for `hasNulls()`. */
public val AnyCol.hasNulls: Boolean
    get() {
        return hasNulls()
    }

/** Property-style shorthand for `size()`. */
public val AnyCol.size: Int
    get() {
        return size()
    }

/** Property-style shorthand for `indices()`. */
public val AnyCol.indices: IntRange
    get() {
        return indices()
    }

/** Property-style shorthand for `type()`. */
public val AnyCol.type: KType
    get() {
        return type()
    }

/** Property-style shorthand for `kind()`. */
public val AnyCol.kind: ColumnKind
    get() {
        return kind()
    }

/**
 * The classifier of the column [type] as a [KClass].
 *
 * Fails via `error(...)` when the classifier is not a [KClass] (this includes a `null` classifier).
 */
public val AnyCol.typeClass: KClass<*>
    get() {
        val classifier = type.classifier
        if (classifier is KClass<*>) {
            return classifier
        }
        error("Cannot cast ${type.classifier?.javaClass} to a ${KClass::class}. Column $name: $type")
    }

/** Returns the range of valid positions in this column: from `0` up to, but excluding, `size()`. */
public fun AnyBaseCol.indices(): IntRange {
    return 0 until size()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
package org.jetbrains.kotlinx.dataframe

import org.jetbrains.kotlinx.dataframe.aggregation.Aggregatable
import org.jetbrains.kotlinx.dataframe.aggregation.AggregateGroupedBody
import org.jetbrains.kotlinx.dataframe.annotations.HasSchema
import org.jetbrains.kotlinx.dataframe.api.ColumnsSelectionDsl
import org.jetbrains.kotlinx.dataframe.api.add
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.getRows
import org.jetbrains.kotlinx.dataframe.api.indices
import org.jetbrains.kotlinx.dataframe.api.rows
import org.jetbrains.kotlinx.dataframe.api.select
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
import org.jetbrains.kotlinx.dataframe.columns.UnresolvedColumnsPolicy
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
import org.jetbrains.kotlinx.dataframe.impl.DataFrameImpl
import org.jetbrains.kotlinx.dataframe.impl.DataFrameSize
import org.jetbrains.kotlinx.dataframe.impl.getColumnsImpl
import org.jetbrains.kotlinx.dataframe.impl.headPlusArray
import org.jetbrains.kotlinx.dataframe.impl.headPlusIterable
import org.jetbrains.kotlinx.dataframe.impl.schema.createEmptyDataFrame
import org.jetbrains.kotlinx.dataframe.impl.schema.createEmptyDataFrameOf
import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
import kotlin.reflect.KType

/**
 * Read-only interface for an ordered list of [columns][DataColumn].
 *
 * The columns of a `DataFrame` have distinct, non-empty [names][DataColumn.name] and equal [sizes][DataColumn.size].
 *
 * @param T Schema marker. It identifies the column schema and is used to generate schema-specific extension
 * properties for typed data access. It is covariant, so `DataFrame<A>` is assignable to a variable of type
 * `DataFrame<B>` if `A` is a subtype of `B`.
 */
@HasSchema(schemaArg = 0)
public interface DataFrame<out T> :
    Aggregatable<T>,
    ColumnsContainer<T> {

    public companion object {
        /** Shared data frame built from an empty column list and a row count of 0. */
        public val Empty: AnyFrame = DataFrameImpl<Unit>(emptyList(), 0)

        /**
         * Returns a data frame without columns and with [nrow] rows.
         *
         * For `nrow == 0` the shared [Empty] instance is returned instead of a new object.
         */
        public fun empty(nrow: Int = 0): AnyFrame =
            when (nrow) {
                0 -> Empty
                else -> DataFrameImpl<Unit>(emptyList(), nrow)
            }

        /**
         * Creates a DataFrame with empty columns (rows = 0) for the schema type [T].
         * Can serve as a "null object" in aggregation operations and in operations that work on columns
         * (select, reorder, ...).
         */
        public inline fun <reified T> emptyOf(): DataFrame<T> {
            return createEmptyDataFrameOf(T::class).cast()
        }

        /**
         * Creates a DataFrame with empty columns (rows = 0) for the given [schema].
         * Can serve as a "null object" in aggregation operations and in operations that work on columns
         * (select, reorder, ...).
         */
        public fun empty(schema: DataFrameSchema): AnyFrame {
            return schema.createEmptyDataFrame()
        }
    }

    // region columns

    /** Returns the column names as a list. */
    public fun columnNames(): List<String>

    /** Returns the column types as a list. */
    public fun columnTypes(): List<KType>

    // endregion

    // region rows

    /** Returns the number of rows. */
    public fun rowsCount(): Int

    /** Iterates over the rows of this data frame. */
    public operator fun iterator(): Iterator<DataRow<T>> {
        return rows().iterator()
    }

    // endregion

    public fun <R> aggregate(body: AggregateGroupedBody<T, R>): DataRow<T>

    // region get columns

    /**
     * Returns the list of columns selected by [columns], a [ColumnsSelectionDsl].
     *
     * Columns are resolved with [UnresolvedColumnsPolicy.Fail].
     *
     * NOTE: This doesn't work in [ColumnsSelectionDsl], use [ColumnsSelectionDsl.cols] to select columns by predicate.
     */
    override fun <C> get(columns: ColumnsSelector<T, C>): List<DataColumn<C>> {
        return getColumnsImpl(UnresolvedColumnsPolicy.Fail, columns)
    }

    // endregion

    // region get rows

    /** Returns the row at position [index]. */
    public operator fun get(index: Int): DataRow<T>

    /** Returns the rows at the given [indices]; forwards to `getRows`. */
    public operator fun get(indices: Iterable<Int>): DataFrame<T> {
        return getRows(indices)
    }

    /** Returns the rows inside [range]; forwards to `getRows`. */
    public operator fun get(range: IntRange): DataFrame<T> {
        return getRows(range)
    }

    /** Returns the rows covered by [first] followed by every range in [ranges], flattened into one index sequence. */
    public operator fun get(first: IntRange, vararg ranges: IntRange): DataFrame<T> {
        val rowIndices = headPlusArray(first, ranges)
            .asSequence()
            .flatMap { range -> range.asSequence() }
            .asIterable()
        return getRows(rowIndices)
    }

    /** Returns the rows at [firstIndex] followed by [otherIndices]. */
    public operator fun get(firstIndex: Int, vararg otherIndices: Int): DataFrame<T> {
        return get(headPlusIterable(firstIndex, otherIndices.asIterable()))
    }

    // endregion

    // region plus columns

    /** Operator form of `add` for a single column. */
    public operator fun plus(col: AnyBaseCol): DataFrame<T> {
        return add(col)
    }

    /** Builds a data frame from the current columns followed by [cols]. */
    public operator fun plus(cols: Iterable<AnyBaseCol>): DataFrame<T> {
        return (columns() + cols).toDataFrame().cast()
    }

    // endregion
}

// region get columns

/**
 * Operator form of column selection: returns the list of columns selected by [columns], a [ColumnsSelectionDsl].
 */
public operator fun <T, C> DataFrame<T>.get(columns: ColumnsSelector<T, C>): List<DataColumn<C>> {
    return this.get(columns)
}

/** Selects the columns referenced by [first] and [other], in that order, into a new data frame. */
public operator fun <T> DataFrame<T>.get(first: AnyColumnReference, vararg other: AnyColumnReference): DataFrame<T> {
    return select {
        (listOf(first) + other).toColumnSet()
    }
}

/** Selects the columns named [first] and [other], in that order, into a new data frame. */
public operator fun <T> DataFrame<T>.get(first: String, vararg other: String): DataFrame<T> {
    return select {
        (listOf(first) + other).toColumnSet()
    }
}

/** Selects the column range that runs from the start to the inclusive end of [columnRange]. */
public operator fun <T> DataFrame<T>.get(columnRange: ClosedRange<String>): DataFrame<T> {
    return select {
        val fromColumn = columnRange.start
        val toColumn = columnRange.endInclusive
        fromColumn..toColumn
    }
}

// endregion

// Internal property-style shorthands. Each getter only forwards to the function named in its comment.
// The first three leave the property type to be inferred from the getter expression.

/** Shorthand for `columnsCount()`. */
internal val ColumnsContainer<*>.ncol get() = columnsCount()
/** Shorthand for `rowsCount()`. */
internal val AnyFrame.nrow get() = rowsCount()
/** Shorthand for `indices()`. */
internal val AnyFrame.indices get() = indices()
/** Shorthand for `size()`. */
internal val AnyFrame.size: DataFrameSize get() = size()

/** Returns the dimensions of this data frame as a [DataFrameSize] built from the column count and the row count. */
public fun AnyFrame.size(): DataFrameSize {
    return DataFrameSize(ncol, nrow)
}
Loading

0 comments on commit ae8ce9c

Please sign in to comment.