diff --git a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt index 01c5cd3bd..fe2ebc2af 100644 --- a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt +++ b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt @@ -6,6 +6,7 @@ import kotlinx.datetime.toKotlinLocalDateTime import org.apache.poi.hssf.usermodel.HSSFWorkbook import org.apache.poi.ss.usermodel.Cell import org.apache.poi.ss.usermodel.CellType +import org.apache.poi.ss.usermodel.DataFormatter import org.apache.poi.ss.usermodel.DateUtil import org.apache.poi.ss.usermodel.RichTextString import org.apache.poi.ss.usermodel.Row @@ -83,6 +84,8 @@ private fun setWorkbookTempDirectory() { /** * @param sheetName sheet to read. By default, the first sheet in the document * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + * @param stringColumns range of columns to read as String regardless of a cell type. + * For example, by default numeric cell with value "3" will be parsed as Double with value being 3.0. With this option, it will be simply "3" * @param skipRows number of rows before header * @param rowsCount number of rows to read. * @param nameRepairStrategy handling of column names. @@ -93,17 +96,22 @@ public fun DataFrame.Companion.readExcel( sheetName: String? = null, skipRows: Int = 0, columns: String? = null, + stringColumns: StringColumns? = null, rowsCount: Int? = null, nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE, ): AnyFrame { setWorkbookTempDirectory() val wb = WorkbookFactory.create(url.openStream()) - return wb.use { readExcel(wb, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) } + return wb.use { + readExcel(wb, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy) + } } /** * @param sheetName sheet to read. By default, the first sheet in the document * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + * @param stringColumns range of columns to read as String regardless of a cell type. + * For example, by default numeric cell with value "3" will be parsed as Double with value being 3.0. With this option, it will be simply "3" * @param skipRows number of rows before header * @param rowsCount number of rows to read. * @param nameRepairStrategy handling of column names. @@ -114,17 +122,22 @@ public fun DataFrame.Companion.readExcel( sheetName: String? = null, skipRows: Int = 0, columns: String? = null, + stringColumns: StringColumns? = null, rowsCount: Int? = null, nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE, ): AnyFrame { setWorkbookTempDirectory() val wb = WorkbookFactory.create(file) - return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) } + return wb.use { + readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy) + } } /** * @param sheetName sheet to read. By default, the first sheet in the document * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + * @param stringColumns range of columns to read as String regardless of a cell type. + * For example, by default numeric cell with value "3" will be parsed as Double with value being 3.0. With this option, it will be simply "3" * @param skipRows number of rows before header * @param rowsCount number of rows to read. * @param nameRepairStrategy handling of column names. @@ -135,13 +148,17 @@ public fun DataFrame.Companion.readExcel( sheetName: String? = null, skipRows: Int = 0, columns: String? = null, + stringColumns: StringColumns? = null, rowsCount: Int? = null, nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE, -): AnyFrame = readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, rowsCount, nameRepairStrategy) +): AnyFrame = + readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, stringColumns, rowsCount, nameRepairStrategy) /** * @param sheetName sheet to read. By default, the first sheet in the document * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + * @param stringColumns range of columns to read as String regardless of a cell type. + * For example, by default numeric cell with value "3" will be parsed as Double with value being 3.0. With this option, it will be simply "3" * @param skipRows number of rows before header * @param rowsCount number of rows to read. * @param nameRepairStrategy handling of column names. @@ -152,17 +169,23 @@ public fun DataFrame.Companion.readExcel( sheetName: String? = null, skipRows: Int = 0, columns: String? = null, + stringColumns: StringColumns? = null, rowsCount: Int? = null, nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE, ): AnyFrame { setWorkbookTempDirectory() val wb = WorkbookFactory.create(inputStream) - return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) } + return wb.use { + readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy) + } } /** * @param sheetName sheet to read. By default, the first sheet in the document * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + * @param formattingOptions range of columns to read as String regardless of a cell type. + * For example, by default numeric cell with value "3" will be parsed as Double with value being 3.0. With this option, it will be simply "3" + * See also [FormattingOptions.formatter] and [DataFormatter.formatCellValue]. * @param skipRows number of rows before header * @param rowsCount number of rows to read. * @param nameRepairStrategy handling of column names. @@ -173,18 +196,39 @@ public fun DataFrame.Companion.readExcel( sheetName: String? = null, skipRows: Int = 0, columns: String? = null, + formattingOptions: FormattingOptions? = null, rowsCount: Int? = null, nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE, ): AnyFrame { val sheet: Sheet = sheetName ?.let { wb.getSheet(it) ?: error("Sheet with name $sheetName not found") } ?: wb.getSheetAt(0) - return readExcel(sheet, columns, skipRows, rowsCount, nameRepairStrategy) + return readExcel(sheet, columns, formattingOptions, skipRows, rowsCount, nameRepairStrategy) +} + +/** + * @param range comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + */ +@JvmInline +public value class StringColumns(public val range: String) + +public fun StringColumns.toFormattingOptions(formatter: DataFormatter = DataFormatter()): FormattingOptions = + FormattingOptions(range, formatter) + +/** + * @param range comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + * @param formatter + */ +public class FormattingOptions(range: String, public val formatter: DataFormatter = DataFormatter()) { + public val columnIndices: Set = getColumnIndices(range).toSet() } /** * @param sheet sheet to read. * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + * @param formattingOptions range of columns to read as String regardless of a cell's type. + * For example, by default numeric cell with value "3" will be parsed as Double with value being 3.0. With this option, it will be simply "3" + * See also [FormattingOptions.formatter] and [DataFormatter.formatCellValue]. * @param skipRows number of rows before header * @param rowsCount number of rows to read. * @param nameRepairStrategy handling of column names. @@ -193,19 +237,13 @@ public fun DataFrame.Companion.readExcel( public fun DataFrame.Companion.readExcel( sheet: Sheet, columns: String? = null, + formattingOptions: FormattingOptions? = null, skipRows: Int = 0, rowsCount: Int? = null, nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE, ): AnyFrame { val columnIndexes: Iterable = if (columns != null) { - columns.split(",").flatMap { - if (it.contains(":")) { - val (start, end) = it.split(":").map { CellReference.convertColStringToIndex(it) } - start..end - } else { - listOf(CellReference.convertColStringToIndex(it)) - } - } + getColumnIndices(columns) } else { val headerRow = checkNotNull(sheet.getRow(skipRows)) { "Row number ${skipRows + 1} (1-based index) is not defined on the sheet ${sheet.sheetName}" @@ -235,17 +273,32 @@ public fun DataFrame.Companion.readExcel( val name = repairNameIfRequired(nameFromCell, columnNameCounters, nameRepairStrategy) columnNameCounters[nameFromCell] = columnNameCounters.getOrDefault(nameFromCell, 0) + 1 // increase the counter for specific column name + val getCellValue: (Cell?) -> Any? = when { + formattingOptions != null && index in formattingOptions.columnIndices -> { cell: Cell? -> + formattingOptions.formatter.formatCellValue(cell) + } + else -> { cell -> cell.cellValue(sheet.sheetName) } + } val values: List = valueRowsRange.map { val row: Row? = sheet.getRow(it) val cell: Cell? = row?.getCell(index) - cell.cellValue(sheet.sheetName) + getCellValue(cell) } DataColumn.createWithTypeInference(name, values) } return dataFrameOf(columns) } +private fun getColumnIndices(columns: String): List = columns.split(",").flatMap { + if (it.contains(":")) { + val (start, end) = it.split(":").map { CellReference.convertColStringToIndex(it) } + start..end + } else { + listOf(CellReference.convertColStringToIndex(it)) + } +} + /** * This is a universal function for name repairing * and should be moved to the API module later, @@ -324,7 +377,7 @@ public fun DataFrame.writeExcel( keepFile: Boolean = false, ) { val factory = - if (keepFile){ + if (keepFile) { when (workBookType) { WorkBookType.XLS -> HSSFWorkbook(file.inputStream()) WorkBookType.XLSX -> XSSFWorkbook(file.inputStream()) diff --git a/dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt b/dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt index e6a05a087..fd8c6835f 100644 --- a/dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt +++ b/dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt @@ -45,6 +45,17 @@ class XlsxTest { df shouldBe dataFrameOf("col1", "col2", "C")(1.0, null, 3.0) } + @Test + fun `column with empty header and with formatting`() { + val df = DataFrame.readExcel( + testResource("sample2.xlsx"), + "Sheet1", + columns = "A:C", + stringColumns = StringColumns("A:C") + ) + df shouldBe dataFrameOf("col1", "col2", "C")("1", "", "3") + } + @Test fun `limit row number`() { val df = DataFrame.readExcel(testResource("sample4.xls"), "Sheet1", rowsCount = 5) @@ -179,4 +190,14 @@ class XlsxTest { val df = DataFrame.readExcel(testResource("formula_cell.xlsx")) df.columnNames() shouldBe listOf("Number", "Greater than 5", "Multiplied by 10", "Divided by 5") } + + @Test + fun `read mixed column`() { + val df = DataFrame.readExcel( + testResource("mixed_column.xlsx"), + stringColumns = StringColumns("A") + ) + df["col1"].type() shouldBe typeOf() + df shouldBe dataFrameOf("col1")("100", "A100", "B100", "C100") + } } diff --git a/dataframe-excel/src/test/resources/mixed_column.xlsx b/dataframe-excel/src/test/resources/mixed_column.xlsx new file mode 100644 index 000000000..0d766970f Binary files /dev/null and b/dataframe-excel/src/test/resources/mixed_column.xlsx differ diff --git a/docs/StardustDocs/topics/read.md b/docs/StardustDocs/topics/read.md index 91f883fe9..3550da482 100644 --- a/docs/StardustDocs/topics/read.md +++ b/docs/StardustDocs/topics/read.md @@ -401,7 +401,7 @@ Sometimes cells can have the wrong format in an Excel file. For example, you exp ```text IDS -100 <-- Intended to be String, but has wrong cell format in original .xlsx file +100 <-- Intended to be String, but has numeric cell format in original .xlsx file A100 B100 C100 @@ -409,20 +409,12 @@ C100 You will get column of `Serializable` instead (common parent for `Double` and `String`). -You can fix it using the `.convert()` function: +You can fix it by providing an additional parameter: ```kotlin -val df = dataFrameOf("IDS")(100.0, "A100", "B100", "C100") -val df1 = df.convert("IDS").with(Infer.Type) { - if (it is Double) { - it.toLong().toString() - } else { - it - } -} -df1["IDS"].type() shouldBe typeOf() +val df = DataFrame.readExcel("mixed_column.xlsx", stringColumns = StringColumns("A")) ``` diff --git a/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Read.kt b/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Read.kt index 371a9e566..8fd02b5b7 100644 --- a/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Read.kt +++ b/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Read.kt @@ -3,20 +3,19 @@ package org.jetbrains.kotlinx.dataframe.samples.api import io.kotest.matchers.shouldBe import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.DataRow -import org.jetbrains.kotlinx.dataframe.api.Infer import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.api.columnNames import org.jetbrains.kotlinx.dataframe.api.columnTypes -import org.jetbrains.kotlinx.dataframe.api.convert -import org.jetbrains.kotlinx.dataframe.api.dataFrameOf -import org.jetbrains.kotlinx.dataframe.api.with import org.jetbrains.kotlinx.dataframe.io.ColType +import org.jetbrains.kotlinx.dataframe.io.StringColumns import org.jetbrains.kotlinx.dataframe.io.readArrowFeather import org.jetbrains.kotlinx.dataframe.io.readCSV +import org.jetbrains.kotlinx.dataframe.io.readExcel import org.jetbrains.kotlinx.dataframe.io.readJson import org.jetbrains.kotlinx.dataframe.testArrowFeather import org.jetbrains.kotlinx.dataframe.testCsv import org.jetbrains.kotlinx.dataframe.testJson +import org.junit.Ignore import org.junit.Test import java.util.* import kotlin.reflect.typeOf @@ -63,17 +62,10 @@ class Read { } @Test + @Ignore fun fixMixedColumn() { // SampleStart - val df = dataFrameOf("IDS")(100.0, "A100", "B100", "C100") - val df1 = df.convert("IDS").with(Infer.Type) { - if (it is Double) { - it.toLong().toString() - } else { - it - } - } - df1["IDS"].type() shouldBe typeOf() + val df = DataFrame.readExcel("mixed_column.xlsx", stringColumns = StringColumns("A")) // SampleEnd }