Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an option to read Excel cell values as a String regardless of their content type #745

Merged
merged 1 commit into from
Jun 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import kotlinx.datetime.toKotlinLocalDateTime
import org.apache.poi.hssf.usermodel.HSSFWorkbook
import org.apache.poi.ss.usermodel.Cell
import org.apache.poi.ss.usermodel.CellType
import org.apache.poi.ss.usermodel.DataFormatter
import org.apache.poi.ss.usermodel.DateUtil
import org.apache.poi.ss.usermodel.RichTextString
import org.apache.poi.ss.usermodel.Row
Expand Down Expand Up @@ -83,6 +84,8 @@ private fun setWorkbookTempDirectory() {
/**
* @param sheetName sheet to read. By default, the first sheet in the document
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
* @param stringColumns range of columns to read as [String] regardless of a cell's type.
* For example, by default a numeric cell with value "3" will be parsed as a [Double] with value 3.0. With this option, it will be read as the [String] "3".
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd surround types with [] to make them clickable :). *their type or *a cell's type

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What types? String and Double?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, that's what I do to all types across the kdocs :) I think it's nice for interactivity and consistency.

* @param skipRows number of rows before header
* @param rowsCount number of rows to read.
* @param nameRepairStrategy handling of column names.
Expand All @@ -93,17 +96,22 @@ public fun DataFrame.Companion.readExcel(
sheetName: String? = null,
skipRows: Int = 0,
columns: String? = null,
stringColumns: StringColumns? = null,
rowsCount: Int? = null,
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
): AnyFrame {
setWorkbookTempDirectory()
val wb = WorkbookFactory.create(url.openStream())
return wb.use { readExcel(wb, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
return wb.use {
readExcel(wb, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
}
}

/**
* @param sheetName sheet to read. By default, the first sheet in the document
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
* @param stringColumns range of columns to read as [String] regardless of a cell's type.
* For example, by default a numeric cell with value "3" will be parsed as a [Double] with value 3.0. With this option, it will be read as the [String] "3".
* @param skipRows number of rows before header
* @param rowsCount number of rows to read.
* @param nameRepairStrategy handling of column names.
Expand All @@ -114,17 +122,22 @@ public fun DataFrame.Companion.readExcel(
sheetName: String? = null,
skipRows: Int = 0,
columns: String? = null,
stringColumns: StringColumns? = null,
rowsCount: Int? = null,
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
): AnyFrame {
setWorkbookTempDirectory()
val wb = WorkbookFactory.create(file)
return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
return wb.use {
readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
}
}

/**
* @param sheetName sheet to read. By default, the first sheet in the document
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
* @param stringColumns range of columns to read as [String] regardless of a cell's type.
* For example, by default a numeric cell with value "3" will be parsed as a [Double] with value 3.0. With this option, it will be read as the [String] "3".
* @param skipRows number of rows before header
* @param rowsCount number of rows to read.
* @param nameRepairStrategy handling of column names.
Expand All @@ -135,13 +148,17 @@ public fun DataFrame.Companion.readExcel(
sheetName: String? = null,
skipRows: Int = 0,
columns: String? = null,
stringColumns: StringColumns? = null,
rowsCount: Int? = null,
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
): AnyFrame = readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, rowsCount, nameRepairStrategy)
): AnyFrame =
readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, stringColumns, rowsCount, nameRepairStrategy)

/**
* @param sheetName sheet to read. By default, the first sheet in the document
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
* @param stringColumns range of columns to read as [String] regardless of a cell's type.
* For example, by default a numeric cell with value "3" will be parsed as a [Double] with value 3.0. With this option, it will be read as the [String] "3".
* @param skipRows number of rows before header
* @param rowsCount number of rows to read.
* @param nameRepairStrategy handling of column names.
Expand All @@ -152,17 +169,23 @@ public fun DataFrame.Companion.readExcel(
sheetName: String? = null,
skipRows: Int = 0,
columns: String? = null,
stringColumns: StringColumns? = null,
rowsCount: Int? = null,
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
): AnyFrame {
setWorkbookTempDirectory()
val wb = WorkbookFactory.create(inputStream)
return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) }
return wb.use {
readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
}
}

/**
* @param sheetName sheet to read. By default, the first sheet in the document
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
* @param formattingOptions columns to read as [String] regardless of a cell's type, together with the formatter applied to them.
* For example, by default a numeric cell with value "3" will be parsed as a [Double] with value 3.0. With this option, it will be read as the [String] "3".
* See also [FormattingOptions.formatter] and [DataFormatter.formatCellValue].
* @param skipRows number of rows before header
* @param rowsCount number of rows to read.
* @param nameRepairStrategy handling of column names.
Expand All @@ -173,18 +196,39 @@ public fun DataFrame.Companion.readExcel(
sheetName: String? = null,
skipRows: Int = 0,
columns: String? = null,
formattingOptions: FormattingOptions? = null,
rowsCount: Int? = null,
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
): AnyFrame {
val sheet: Sheet = sheetName
?.let { wb.getSheet(it) ?: error("Sheet with name $sheetName not found") }
?: wb.getSheetAt(0)
return readExcel(sheet, columns, skipRows, rowsCount, nameRepairStrategy)
return readExcel(sheet, columns, formattingOptions, skipRows, rowsCount, nameRepairStrategy)
}

/**
 * Selection of Excel columns whose cells should be read as [String] values.
 *
 * @param range comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
 */
@JvmInline
public value class StringColumns(public val range: String)

/**
 * Builds [FormattingOptions] for the columns described by this [StringColumns].
 *
 * @param formatter formatter handed over to the resulting [FormattingOptions]; a fresh [DataFormatter] by default
 * @return options carrying this selection's [StringColumns.range] and the given [formatter]
 */
public fun StringColumns.toFormattingOptions(formatter: DataFormatter = DataFormatter()): FormattingOptions {
    return FormattingOptions(range = range, formatter = formatter)
}

/**
* Describes which columns are read through a [DataFormatter] and which formatter is used for them.
*
* @param range comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
koperagen marked this conversation as resolved.
Show resolved Hide resolved
* @param formatter [DataFormatter] whose `formatCellValue` result is used as the value of cells in the selected columns
*/
public class FormattingOptions(range: String, public val formatter: DataFormatter = DataFormatter()) {
koperagen marked this conversation as resolved.
Show resolved Hide resolved
// Column indices parsed from `range`; stored as a Set, so repeated columns collapse into one entry.
public val columnIndices: Set<Int> = getColumnIndices(range).toSet()
}

/**
* @param sheet sheet to read.
* @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”)
* @param formattingOptions columns to read as [String] regardless of a cell's type, together with the formatter applied to them.
* For example, by default a numeric cell with value "3" will be parsed as a [Double] with value 3.0. With this option, it will be read as the [String] "3".
* See also [FormattingOptions.formatter] and [DataFormatter.formatCellValue].
* @param skipRows number of rows before header
* @param rowsCount number of rows to read.
* @param nameRepairStrategy handling of column names.
Expand All @@ -193,19 +237,13 @@ public fun DataFrame.Companion.readExcel(
public fun DataFrame.Companion.readExcel(
sheet: Sheet,
columns: String? = null,
formattingOptions: FormattingOptions? = null,
skipRows: Int = 0,
rowsCount: Int? = null,
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
): AnyFrame {
val columnIndexes: Iterable<Int> = if (columns != null) {
columns.split(",").flatMap {
if (it.contains(":")) {
val (start, end) = it.split(":").map { CellReference.convertColStringToIndex(it) }
start..end
} else {
listOf(CellReference.convertColStringToIndex(it))
}
}
getColumnIndices(columns)
} else {
val headerRow = checkNotNull(sheet.getRow(skipRows)) {
"Row number ${skipRows + 1} (1-based index) is not defined on the sheet ${sheet.sheetName}"
Expand Down Expand Up @@ -235,17 +273,32 @@ public fun DataFrame.Companion.readExcel(
val name = repairNameIfRequired(nameFromCell, columnNameCounters, nameRepairStrategy)
columnNameCounters[nameFromCell] =
columnNameCounters.getOrDefault(nameFromCell, 0) + 1 // increase the counter for specific column name
val getCellValue: (Cell?) -> Any? = when {
formattingOptions != null && index in formattingOptions.columnIndices -> { cell: Cell? ->
formattingOptions.formatter.formatCellValue(cell)
}

else -> { cell -> cell.cellValue(sheet.sheetName) }
}
val values: List<Any?> = valueRowsRange.map {
val row: Row? = sheet.getRow(it)
val cell: Cell? = row?.getCell(index)
cell.cellValue(sheet.sheetName)
getCellValue(cell)
}
DataColumn.createWithTypeInference(name, values)
}
return dataFrameOf(columns)
}

/**
 * Parses a column selection such as "A:E" or "A,C,E:F" into column indices.
 *
 * The string is split on commas. A token containing ':' is expanded into the inclusive range between
 * its first two column references; any other token is a single column. Whitespace around tokens and
 * range bounds is trimmed before conversion, because [CellReference.convertColStringToIndex] does not
 * reject non-letter characters and would otherwise yield a wrong index for input like "A, C".
 *
 * @param columns comma separated list of Excel column letters and column ranges
 * @return indices produced by [CellReference.convertColStringToIndex], in the order listed; duplicates are kept
 */
private fun getColumnIndices(columns: String): List<Int> =
    columns.split(",").flatMap { token ->
        if (token.contains(":")) {
            val (start, end) = token.split(":").map { bound ->
                CellReference.convertColStringToIndex(bound.trim())
            }
            start..end
        } else {
            listOf(CellReference.convertColStringToIndex(token.trim()))
        }
    }

/**
* This is a universal function for name repairing
* and should be moved to the API module later,
Expand Down Expand Up @@ -324,7 +377,7 @@ public fun <T> DataFrame<T>.writeExcel(
keepFile: Boolean = false,
) {
val factory =
if (keepFile){
if (keepFile) {
when (workBookType) {
WorkBookType.XLS -> HSSFWorkbook(file.inputStream())
WorkBookType.XLSX -> XSSFWorkbook(file.inputStream())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,17 @@ class XlsxTest {
df shouldBe dataFrameOf("col1", "col2", "C")(1.0, null, 3.0)
}

@Test
fun `column with empty header and with formatting`() {
    // The same range is used for column selection and for string formatting,
    // so every selected column is expected to hold String values.
    val range = "A:C"
    val expected = dataFrameOf("col1", "col2", "C")("1", "", "3")

    val actual = DataFrame.readExcel(
        testResource("sample2.xlsx"),
        "Sheet1",
        columns = range,
        stringColumns = StringColumns(range),
    )

    actual shouldBe expected
}

@Test
fun `limit row number`() {
val df = DataFrame.readExcel(testResource("sample4.xls"), "Sheet1", rowsCount = 5)
Expand Down Expand Up @@ -179,4 +190,14 @@ class XlsxTest {
val df = DataFrame.readExcel(testResource("formula_cell.xlsx"))
df.columnNames() shouldBe listOf("Number", "Greater than 5", "Multiplied by 10", "Divided by 5")
}

@Test
fun `read mixed column`() {
    // Column A is requested as strings, so the resulting column type is expected to be String.
    val source = testResource("mixed_column.xlsx")
    val frame = DataFrame.readExcel(source, stringColumns = StringColumns("A"))

    frame["col1"].type() shouldBe typeOf<String>()
    frame shouldBe dataFrameOf("col1")("100", "A100", "B100", "C100")
}
}
Binary file not shown.
14 changes: 3 additions & 11 deletions docs/StardustDocs/topics/read.md
Original file line number Diff line number Diff line change
Expand Up @@ -401,28 +401,20 @@ Sometimes cells can have the wrong format in an Excel file. For example, you exp

```text
IDS
100 <-- Intended to be String, but has wrong cell format in original .xlsx file
100 <-- Intended to be String, but has numeric cell format in original .xlsx file
A100
B100
C100
```

You will get a column of `Serializable` instead (the common parent of `Double` and `String`).

You can fix it using the `.convert()` function:
You can fix it by providing an additional parameter:

<!---FUN fixMixedColumn-->

```kotlin
val df = dataFrameOf("IDS")(100.0, "A100", "B100", "C100")
val df1 = df.convert("IDS").with(Infer.Type) {
if (it is Double) {
it.toLong().toString()
} else {
it
}
}
df1["IDS"].type() shouldBe typeOf<String>()
val df = DataFrame.readExcel("mixed_column.xlsx", stringColumns = StringColumns("A"))
```

<!---END-->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,19 @@ package org.jetbrains.kotlinx.dataframe.samples.api
import io.kotest.matchers.shouldBe
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.DataRow
import org.jetbrains.kotlinx.dataframe.api.Infer
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
import org.jetbrains.kotlinx.dataframe.api.columnNames
import org.jetbrains.kotlinx.dataframe.api.columnTypes
import org.jetbrains.kotlinx.dataframe.api.convert
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.api.with
import org.jetbrains.kotlinx.dataframe.io.ColType
import org.jetbrains.kotlinx.dataframe.io.StringColumns
import org.jetbrains.kotlinx.dataframe.io.readArrowFeather
import org.jetbrains.kotlinx.dataframe.io.readCSV
import org.jetbrains.kotlinx.dataframe.io.readExcel
import org.jetbrains.kotlinx.dataframe.io.readJson
import org.jetbrains.kotlinx.dataframe.testArrowFeather
import org.jetbrains.kotlinx.dataframe.testCsv
import org.jetbrains.kotlinx.dataframe.testJson
import org.junit.Ignore
import org.junit.Test
import java.util.*
import kotlin.reflect.typeOf
Expand Down Expand Up @@ -63,17 +62,10 @@ class Read {
}

@Test
@Ignore
fun fixMixedColumn() {
// SampleStart
val df = dataFrameOf("IDS")(100.0, "A100", "B100", "C100")
val df1 = df.convert("IDS").with(Infer.Type) {
if (it is Double) {
it.toLong().toString()
} else {
it
}
}
df1["IDS"].type() shouldBe typeOf<String>()
val df = DataFrame.readExcel("mixed_column.xlsx", stringColumns = StringColumns("A"))
// SampleEnd
}

Expand Down
Loading