Skip to content

Commit

Permalink
Add separator parameter to DataFrame.flatten (#667)
Browse files Browse the repository at this point in the history
Added a 'separator' parameter to the DataFrame.flatten function to customize the separator used in column names when 'keepParentNameForColumns' is true. This allows greater flexibility in formatting column names. Note that the generated name order also changes: the parent name now comes first (e.g. "mean.age" instead of "age.mean"). Tests have been updated accordingly to check for proper functionality.
  • Loading branch information
zaleslaw authored Apr 19, 2024
1 parent bcf6e64 commit 41577df
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,33 @@ import kotlin.reflect.KProperty

// region DataFrame

public fun <T> DataFrame<T>.flatten(keepParentNameForColumns: Boolean = false): DataFrame<T> = flatten(keepParentNameForColumns) { all() }

public fun <T, C> DataFrame<T>.flatten(keepParentNameForColumns: Boolean = false, columns: ColumnsSelector<T, C>): DataFrame<T> = flattenImpl(columns, keepParentNameForColumns)

public fun <T> DataFrame<T>.flatten(vararg columns: String, keepParentNameForColumns: Boolean = false): DataFrame<T> = flatten(keepParentNameForColumns) { columns.toColumnSet() }

public fun <T, C> DataFrame<T>.flatten(vararg columns: ColumnReference<C>, keepParentNameForColumns: Boolean = false): DataFrame<T> =
flatten(keepParentNameForColumns) { columns.toColumnSet() }

public fun <T, C> DataFrame<T>.flatten(vararg columns: KProperty<C>, keepParentNameForColumns: Boolean = false): DataFrame<T> =
flatten(keepParentNameForColumns) { columns.toColumnSet() }
/**
 * Flattens this [DataFrame] using `all()` as the column selector; the call is forwarded
 * to the selector-based `flatten` overload.
 *
 * @param keepParentNameForColumns when `true`, a flattened column is named from its parent's
 *   name, [separator] and its own name; when `false` its own name is used.
 * @param separator text inserted between the parent name and the column name. Only relevant
 *   when [keepParentNameForColumns] is `true`. Defaults to `"."`.
 * @return the [DataFrame] returned by the selector-based overload.
 */
public fun <T> DataFrame<T>.flatten(
    keepParentNameForColumns: Boolean = false,
    separator: String = "."
): DataFrame<T> {
    return flatten(keepParentNameForColumns = keepParentNameForColumns, separator = separator) { all() }
}

/**
 * Flattens the columns chosen by [columns]; the actual work is delegated to `flattenImpl`.
 *
 * @param keepParentNameForColumns when `true`, a flattened column is named from its parent's
 *   name, [separator] and its own name; when `false` its own name is used.
 * @param separator text inserted between the parent name and the column name. Only relevant
 *   when [keepParentNameForColumns] is `true`. Defaults to `"."`.
 * @param columns selector of the columns to flatten.
 * @return the [DataFrame] produced by `flattenImpl`.
 */
public fun <T, C> DataFrame<T>.flatten(
    keepParentNameForColumns: Boolean = false,
    separator: String = ".",
    columns: ColumnsSelector<T, C>
): DataFrame<T> {
    return flattenImpl(
        columns = columns,
        keepParentNameForColumns = keepParentNameForColumns,
        separator = separator
    )
}

/**
 * Flattens the columns identified by name; the names are turned into a column set and
 * handed to the selector-based `flatten` overload.
 *
 * @param columns names of the columns to flatten.
 * @param keepParentNameForColumns when `true`, a flattened column is named from its parent's
 *   name, [separator] and its own name; when `false` its own name is used.
 * @param separator text inserted between the parent name and the column name. Only relevant
 *   when [keepParentNameForColumns] is `true`. Defaults to `"."`.
 * @return the [DataFrame] returned by the selector-based overload.
 */
public fun <T> DataFrame<T>.flatten(
    vararg columns: String,
    keepParentNameForColumns: Boolean = false,
    separator: String = "."
): DataFrame<T> {
    return flatten(keepParentNameForColumns = keepParentNameForColumns, separator = separator) {
        columns.toColumnSet()
    }
}

/**
 * Flattens the columns identified by [ColumnReference]s; the references are turned into a
 * column set and handed to the selector-based `flatten` overload.
 *
 * @param columns references to the columns to flatten.
 * @param keepParentNameForColumns when `true`, a flattened column is named from its parent's
 *   name, [separator] and its own name; when `false` its own name is used.
 * @param separator text inserted between the parent name and the column name. Only relevant
 *   when [keepParentNameForColumns] is `true`. Defaults to `"."`.
 * @return the [DataFrame] returned by the selector-based overload.
 */
public fun <T, C> DataFrame<T>.flatten(
    vararg columns: ColumnReference<C>,
    keepParentNameForColumns: Boolean = false,
    separator: String = "."
): DataFrame<T> {
    return flatten(keepParentNameForColumns = keepParentNameForColumns, separator = separator) {
        columns.toColumnSet()
    }
}

/**
 * Flattens the columns identified by [KProperty] accessors; the properties are turned into
 * a column set and handed to the selector-based `flatten` overload.
 *
 * @param columns properties naming the columns to flatten.
 * @param keepParentNameForColumns when `true`, a flattened column is named from its parent's
 *   name, [separator] and its own name; when `false` its own name is used.
 * @param separator text inserted between the parent name and the column name. Only relevant
 *   when [keepParentNameForColumns] is `true`. Defaults to `"."`.
 * @return the [DataFrame] returned by the selector-based overload.
 */
public fun <T, C> DataFrame<T>.flatten(
    vararg columns: KProperty<C>,
    keepParentNameForColumns: Boolean = false,
    separator: String = "."
): DataFrame<T> {
    return flatten(keepParentNameForColumns = keepParentNameForColumns, separator = separator) {
        columns.toColumnSet()
    }
}

// endregion
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ import org.jetbrains.kotlinx.dataframe.impl.columns.toColumnSet

internal fun <T, C> DataFrame<T>.flattenImpl(
columns: ColumnsSelector<T, C>,
keepParentNameForColumns: Boolean = false
keepParentNameForColumns: Boolean = false,
separator: String = ".",
): DataFrame<T> {
val rootColumns = getColumnsWithPaths {
columns.toColumnSet().filter { it.isColumnGroup() }.simplify()
Expand All @@ -32,7 +33,7 @@ internal fun <T, C> DataFrame<T>.flattenImpl(
.into {
val targetPath = getRootPrefix(it.path).dropLast(1)
val nameGen = nameGenerators[targetPath]!!
val preferredName = if (keepParentNameForColumns) "${it.name()}.${it.parentName}" else it.name()
val preferredName = if (keepParentNameForColumns) "${it.parentName}${separator}${it.name()}" else it.name()
val name = nameGen.addUnique(preferredName)
targetPath + name
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,17 @@ class FlattenTests {

aggregate
.flatten(keepParentNameForColumns = true)
.columnNames() shouldBe listOf("city", "age.mean", "weight.mean", "age.std", "weight.std")
.columnNames() shouldBe listOf("city", "mean.age", "mean.weight", "std.age", "std.weight")

aggregate
.flatten(keepParentNameForColumns = true, separator = "_happy_separator_")
.columnNames() shouldBe listOf(
"city",
"mean_happy_separator_age",
"mean_happy_separator_weight",
"std_happy_separator_age",
"std_happy_separator_weight"
)
}

@DataSchema
Expand Down
39 changes: 28 additions & 11 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/flatten.kt
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,33 @@ import kotlin.reflect.KProperty

// region DataFrame

public fun <T> DataFrame<T>.flatten(keepParentNameForColumns: Boolean = false): DataFrame<T> = flatten(keepParentNameForColumns) { all() }

public fun <T, C> DataFrame<T>.flatten(keepParentNameForColumns: Boolean = false, columns: ColumnsSelector<T, C>): DataFrame<T> = flattenImpl(columns, keepParentNameForColumns)

public fun <T> DataFrame<T>.flatten(vararg columns: String, keepParentNameForColumns: Boolean = false): DataFrame<T> = flatten(keepParentNameForColumns) { columns.toColumnSet() }

public fun <T, C> DataFrame<T>.flatten(vararg columns: ColumnReference<C>, keepParentNameForColumns: Boolean = false): DataFrame<T> =
flatten(keepParentNameForColumns) { columns.toColumnSet() }

public fun <T, C> DataFrame<T>.flatten(vararg columns: KProperty<C>, keepParentNameForColumns: Boolean = false): DataFrame<T> =
flatten(keepParentNameForColumns) { columns.toColumnSet() }
/**
 * Flattens this [DataFrame] using `all()` as the column selector; the call is forwarded
 * to the selector-based `flatten` overload.
 *
 * @param keepParentNameForColumns when `true`, a flattened column is named from its parent's
 *   name, [separator] and its own name; when `false` its own name is used.
 * @param separator text inserted between the parent name and the column name. Only relevant
 *   when [keepParentNameForColumns] is `true`. Defaults to `"."`.
 * @return the [DataFrame] returned by the selector-based overload.
 */
public fun <T> DataFrame<T>.flatten(
    keepParentNameForColumns: Boolean = false,
    separator: String = "."
): DataFrame<T> {
    return flatten(keepParentNameForColumns = keepParentNameForColumns, separator = separator) { all() }
}

/**
 * Flattens the columns chosen by [columns]; the actual work is delegated to `flattenImpl`.
 *
 * @param keepParentNameForColumns when `true`, a flattened column is named from its parent's
 *   name, [separator] and its own name; when `false` its own name is used.
 * @param separator text inserted between the parent name and the column name. Only relevant
 *   when [keepParentNameForColumns] is `true`. Defaults to `"."`.
 * @param columns selector of the columns to flatten.
 * @return the [DataFrame] produced by `flattenImpl`.
 */
public fun <T, C> DataFrame<T>.flatten(
    keepParentNameForColumns: Boolean = false,
    separator: String = ".",
    columns: ColumnsSelector<T, C>
): DataFrame<T> {
    return flattenImpl(
        columns = columns,
        keepParentNameForColumns = keepParentNameForColumns,
        separator = separator
    )
}

/**
 * Flattens the columns identified by name; the names are turned into a column set and
 * handed to the selector-based `flatten` overload.
 *
 * @param columns names of the columns to flatten.
 * @param keepParentNameForColumns when `true`, a flattened column is named from its parent's
 *   name, [separator] and its own name; when `false` its own name is used.
 * @param separator text inserted between the parent name and the column name. Only relevant
 *   when [keepParentNameForColumns] is `true`. Defaults to `"."`.
 * @return the [DataFrame] returned by the selector-based overload.
 */
public fun <T> DataFrame<T>.flatten(
    vararg columns: String,
    keepParentNameForColumns: Boolean = false,
    separator: String = "."
): DataFrame<T> {
    return flatten(keepParentNameForColumns = keepParentNameForColumns, separator = separator) {
        columns.toColumnSet()
    }
}

/**
 * Flattens the columns identified by [ColumnReference]s; the references are turned into a
 * column set and handed to the selector-based `flatten` overload.
 *
 * @param columns references to the columns to flatten.
 * @param keepParentNameForColumns when `true`, a flattened column is named from its parent's
 *   name, [separator] and its own name; when `false` its own name is used.
 * @param separator text inserted between the parent name and the column name. Only relevant
 *   when [keepParentNameForColumns] is `true`. Defaults to `"."`.
 * @return the [DataFrame] returned by the selector-based overload.
 */
public fun <T, C> DataFrame<T>.flatten(
    vararg columns: ColumnReference<C>,
    keepParentNameForColumns: Boolean = false,
    separator: String = "."
): DataFrame<T> {
    return flatten(keepParentNameForColumns = keepParentNameForColumns, separator = separator) {
        columns.toColumnSet()
    }
}

/**
 * Flattens the columns identified by [KProperty] accessors; the properties are turned into
 * a column set and handed to the selector-based `flatten` overload.
 *
 * @param columns properties naming the columns to flatten.
 * @param keepParentNameForColumns when `true`, a flattened column is named from its parent's
 *   name, [separator] and its own name; when `false` its own name is used.
 * @param separator text inserted between the parent name and the column name. Only relevant
 *   when [keepParentNameForColumns] is `true`. Defaults to `"."`.
 * @return the [DataFrame] returned by the selector-based overload.
 */
public fun <T, C> DataFrame<T>.flatten(
    vararg columns: KProperty<C>,
    keepParentNameForColumns: Boolean = false,
    separator: String = "."
): DataFrame<T> {
    return flatten(keepParentNameForColumns = keepParentNameForColumns, separator = separator) {
        columns.toColumnSet()
    }
}

// endregion
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ import org.jetbrains.kotlinx.dataframe.impl.columns.toColumnSet

internal fun <T, C> DataFrame<T>.flattenImpl(
columns: ColumnsSelector<T, C>,
keepParentNameForColumns: Boolean = false
keepParentNameForColumns: Boolean = false,
separator: String = ".",
): DataFrame<T> {
val rootColumns = getColumnsWithPaths {
columns.toColumnSet().filter { it.isColumnGroup() }.simplify()
Expand All @@ -32,7 +33,7 @@ internal fun <T, C> DataFrame<T>.flattenImpl(
.into {
val targetPath = getRootPrefix(it.path).dropLast(1)
val nameGen = nameGenerators[targetPath]!!
val preferredName = if (keepParentNameForColumns) "${it.name()}.${it.parentName}" else it.name()
val preferredName = if (keepParentNameForColumns) "${it.parentName}${separator}${it.name()}" else it.name()
val name = nameGen.addUnique(preferredName)
targetPath + name
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,17 @@ class FlattenTests {

aggregate
.flatten(keepParentNameForColumns = true)
.columnNames() shouldBe listOf("city", "age.mean", "weight.mean", "age.std", "weight.std")
.columnNames() shouldBe listOf("city", "mean.age", "mean.weight", "std.age", "std.weight")

aggregate
.flatten(keepParentNameForColumns = true, separator = "_happy_separator_")
.columnNames() shouldBe listOf(
"city",
"mean_happy_separator_age",
"mean_happy_separator_weight",
"std_happy_separator_age",
"std_happy_separator_weight"
)
}

@DataSchema
Expand Down

0 comments on commit 41577df

Please sign in to comment.