From c980e8192594189af10b98bd72d3b745650ef409 Mon Sep 17 00:00:00 2001 From: Nikita Klimenko Date: Wed, 19 Jun 2024 17:06:24 +0300 Subject: [PATCH] Add an option to read Excel cell values as a String regardless of their content type fixes #669 --- .../jetbrains/kotlinx/dataframe/io/xlsx.kt | 83 ++++++++++++++---- .../kotlinx/dataframe/io/XlsxTest.kt | 21 +++++ .../src/test/resources/mixed_column.xlsx | Bin 0 -> 4845 bytes docs/StardustDocs/topics/read.md | 14 +-- .../kotlinx/dataframe/samples/api/Read.kt | 18 ++-- 5 files changed, 97 insertions(+), 39 deletions(-) create mode 100644 dataframe-excel/src/test/resources/mixed_column.xlsx diff --git a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt index 01c5cd3bd..fe2ebc2af 100644 --- a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt +++ b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt @@ -6,6 +6,7 @@ import kotlinx.datetime.toKotlinLocalDateTime import org.apache.poi.hssf.usermodel.HSSFWorkbook import org.apache.poi.ss.usermodel.Cell import org.apache.poi.ss.usermodel.CellType +import org.apache.poi.ss.usermodel.DataFormatter import org.apache.poi.ss.usermodel.DateUtil import org.apache.poi.ss.usermodel.RichTextString import org.apache.poi.ss.usermodel.Row @@ -83,6 +84,8 @@ private fun setWorkbookTempDirectory() { /** * @param sheetName sheet to read. By default, the first sheet in the document * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + * @param stringColumns range of columns to read as String regardless of a cell type. + * For example, by default numeric cell with value "3" will be parsed as Double with value being 3.0. With this option, it will be simply "3" * @param skipRows number of rows before header * @param rowsCount number of rows to read. * @param nameRepairStrategy handling of column names. @@ -93,17 +96,22 @@ public fun DataFrame.Companion.readExcel( sheetName: String? = null, skipRows: Int = 0, columns: String? = null, + stringColumns: StringColumns? = null, rowsCount: Int? = null, nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE, ): AnyFrame { setWorkbookTempDirectory() val wb = WorkbookFactory.create(url.openStream()) - return wb.use { readExcel(wb, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) } + return wb.use { + readExcel(wb, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy) + } } /** * @param sheetName sheet to read. By default, the first sheet in the document * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + * @param stringColumns range of columns to read as String regardless of a cell type. + * For example, by default numeric cell with value "3" will be parsed as Double with value being 3.0. With this option, it will be simply "3" * @param skipRows number of rows before header * @param rowsCount number of rows to read. * @param nameRepairStrategy handling of column names. @@ -114,17 +122,22 @@ public fun DataFrame.Companion.readExcel( sheetName: String? = null, skipRows: Int = 0, columns: String? = null, + stringColumns: StringColumns? = null, rowsCount: Int? = null, nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE, ): AnyFrame { setWorkbookTempDirectory() val wb = WorkbookFactory.create(file) - return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) } + return wb.use { + readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy) + } } /** * @param sheetName sheet to read. By default, the first sheet in the document * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + * @param stringColumns range of columns to read as String regardless of a cell type. + * For example, by default numeric cell with value "3" will be parsed as Double with value being 3.0. With this option, it will be simply "3" * @param skipRows number of rows before header * @param rowsCount number of rows to read. * @param nameRepairStrategy handling of column names. @@ -135,13 +148,17 @@ public fun DataFrame.Companion.readExcel( sheetName: String? = null, skipRows: Int = 0, columns: String? = null, + stringColumns: StringColumns? = null, rowsCount: Int? = null, nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE, -): AnyFrame = readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, rowsCount, nameRepairStrategy) +): AnyFrame = + readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, stringColumns, rowsCount, nameRepairStrategy) /** * @param sheetName sheet to read. By default, the first sheet in the document * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + * @param stringColumns range of columns to read as String regardless of a cell type. + * For example, by default numeric cell with value "3" will be parsed as Double with value being 3.0. With this option, it will be simply "3" * @param skipRows number of rows before header * @param rowsCount number of rows to read. * @param nameRepairStrategy handling of column names. @@ -152,17 +169,23 @@ public fun DataFrame.Companion.readExcel( sheetName: String? = null, skipRows: Int = 0, columns: String? = null, + stringColumns: StringColumns? = null, rowsCount: Int? = null, nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE, ): AnyFrame { setWorkbookTempDirectory() val wb = WorkbookFactory.create(inputStream) - return wb.use { readExcel(it, sheetName, skipRows, columns, rowsCount, nameRepairStrategy) } + return wb.use { + readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy) + } } /** * @param sheetName sheet to read. By default, the first sheet in the document * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + * @param formattingOptions range of columns to read as String regardless of a cell type. + * For example, by default numeric cell with value "3" will be parsed as Double with value being 3.0. With this option, it will be simply "3" + * See also [FormattingOptions.formatter] and [DataFormatter.formatCellValue]. * @param skipRows number of rows before header * @param rowsCount number of rows to read. * @param nameRepairStrategy handling of column names. @@ -173,18 +196,39 @@ public fun DataFrame.Companion.readExcel( sheetName: String? = null, skipRows: Int = 0, columns: String? = null, + formattingOptions: FormattingOptions? = null, rowsCount: Int? = null, nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE, ): AnyFrame { val sheet: Sheet = sheetName ?.let { wb.getSheet(it) ?: error("Sheet with name $sheetName not found") } ?: wb.getSheetAt(0) - return readExcel(sheet, columns, skipRows, rowsCount, nameRepairStrategy) + return readExcel(sheet, columns, formattingOptions, skipRows, rowsCount, nameRepairStrategy) +} + +/** + * @param range comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + */ +@JvmInline +public value class StringColumns(public val range: String) + +public fun StringColumns.toFormattingOptions(formatter: DataFormatter = DataFormatter()): FormattingOptions = + FormattingOptions(range, formatter) + +/** + * @param range comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + * @param formatter + */ +public class FormattingOptions(range: String, public val formatter: DataFormatter = DataFormatter()) { + public val columnIndices: Set = getColumnIndices(range).toSet() } /** * @param sheet sheet to read. * @param columns comma separated list of Excel column letters and column ranges (e.g. “A:E” or “A,C,E:F”) + * @param formattingOptions range of columns to read as String regardless of a cell's type. + * For example, by default numeric cell with value "3" will be parsed as Double with value being 3.0. With this option, it will be simply "3" + * See also [FormattingOptions.formatter] and [DataFormatter.formatCellValue]. * @param skipRows number of rows before header * @param rowsCount number of rows to read. * @param nameRepairStrategy handling of column names. @@ -193,19 +237,13 @@ public fun DataFrame.Companion.readExcel( public fun DataFrame.Companion.readExcel( sheet: Sheet, columns: String? = null, + formattingOptions: FormattingOptions? = null, skipRows: Int = 0, rowsCount: Int? = null, nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE, ): AnyFrame { val columnIndexes: Iterable = if (columns != null) { - columns.split(",").flatMap { - if (it.contains(":")) { - val (start, end) = it.split(":").map { CellReference.convertColStringToIndex(it) } - start..end - } else { - listOf(CellReference.convertColStringToIndex(it)) - } - } + getColumnIndices(columns) } else { val headerRow = checkNotNull(sheet.getRow(skipRows)) { "Row number ${skipRows + 1} (1-based index) is not defined on the sheet ${sheet.sheetName}" @@ -235,17 +273,32 @@ public fun DataFrame.Companion.readExcel( val name = repairNameIfRequired(nameFromCell, columnNameCounters, nameRepairStrategy) columnNameCounters[nameFromCell] = columnNameCounters.getOrDefault(nameFromCell, 0) + 1 // increase the counter for specific column name + val getCellValue: (Cell?) -> Any? = when { + formattingOptions != null && index in formattingOptions.columnIndices -> { cell: Cell? -> + formattingOptions.formatter.formatCellValue(cell) + } + else -> { cell -> cell.cellValue(sheet.sheetName) } + } val values: List = valueRowsRange.map { val row: Row? = sheet.getRow(it) val cell: Cell? = row?.getCell(index) - cell.cellValue(sheet.sheetName) + getCellValue(cell) } DataColumn.createWithTypeInference(name, values) } return dataFrameOf(columns) } +private fun getColumnIndices(columns: String): List = columns.split(",").flatMap { + if (it.contains(":")) { + val (start, end) = it.split(":").map { CellReference.convertColStringToIndex(it) } + start..end + } else { + listOf(CellReference.convertColStringToIndex(it)) + } +} + /** * This is a universal function for name repairing * and should be moved to the API module later, @@ -324,7 +377,7 @@ public fun DataFrame.writeExcel( keepFile: Boolean = false, ) { val factory = - if (keepFile){ + if (keepFile) { when (workBookType) { WorkBookType.XLS -> HSSFWorkbook(file.inputStream()) WorkBookType.XLSX -> XSSFWorkbook(file.inputStream()) diff --git a/dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt b/dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt index e6a05a087..fd8c6835f 100644 --- a/dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt +++ b/dataframe-excel/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/XlsxTest.kt @@ -45,6 +45,17 @@ class XlsxTest { df shouldBe dataFrameOf("col1", "col2", "C")(1.0, null, 3.0) } + @Test + fun `column with empty header and with formatting`() { + val df = DataFrame.readExcel( + testResource("sample2.xlsx"), + "Sheet1", + columns = "A:C", + stringColumns = StringColumns("A:C") + ) + df shouldBe dataFrameOf("col1", "col2", "C")("1", "", "3") + } + @Test fun `limit row number`() { val df = DataFrame.readExcel(testResource("sample4.xls"), "Sheet1", rowsCount = 5) @@ -179,4 +190,14 @@ class XlsxTest { val df = DataFrame.readExcel(testResource("formula_cell.xlsx")) df.columnNames() shouldBe listOf("Number", "Greater than 5", "Multiplied by 10", "Divided by 5") } + + @Test + fun `read mixed column`() { + val df = DataFrame.readExcel( + testResource("mixed_column.xlsx"), + stringColumns = StringColumns("A") + ) + df["col1"].type() shouldBe typeOf() + df shouldBe dataFrameOf("col1")("100", "A100", "B100", "C100") + } } diff --git a/dataframe-excel/src/test/resources/mixed_column.xlsx b/dataframe-excel/src/test/resources/mixed_column.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..0d766970ff51d01c76ef74ca1b13713528a74da4 GIT binary patch literal 4845 zcmaJ_1z42Z)&_o!tBxPu%hEAz} z=A8RqxZZnwzvubB`JU%nd(U2L?e|@4YbXOyiIFfdF_A7E#`KWx00{qW;sUmJ1@R#6 z*d;?Mkug!$hcq!a{y@N3gN(wOI6#Wh%!qVw; z_pFX1Gvy=YoJtKiqhl}0z4Nm1PPeF9>~tZFLpn-K_!a}D zGZtGiEud};x16)|4skFt0o+)wpG1_LK&_z{F*@O=At_ZQTXNwZE5;~tq9aSXxo@~n z+2lY1&4lV-Wwf5pOzp;Y8Oe|x1XGQ10@6(u+K>23yA{f zo6|fEpqX|XG#z^HpAB}JwLj8mj}aurHcA@M3ThYl72Wc4WR20xz?s9dE3d&ELQ&+p?HB{PJ_ z4BM1IQd|HL|9g+qF1VxMWj(*IAxF?y&Cqj9YiPebE~W-kt8 z(I6%wx9^5zx<1%zB@LGK%QC8KoL>IdfVdvX5l^OJF&Xyaq6pBF~5u?WK+orwC6VDp+rpiQ@XlMInFQLhDHr^rZ1j)33g z4DR1K%;7>E*$8@vFqBw+rj>wp~e?i|cIN5d$i#D#-#qO{xBmy884O zTsz<93w+lI294tXQH@(9Oj1A^7$hk0T;I?i8zu3XHi2utEytqb`&v5Eu+KOu$EEzf7&d4hi$1{LBV?)AJsfa*=BYHQB@ z%>@rIeoUr{+pnZ&3mTiaD)>B}{|5^zKx($V`BdpgRbAMZP5LAOO%BB~hsRxLi(yN& zKXc!$P^u8A0fC+AZ{OOLmjYPL%kK^AyM7-xZrp7@qFBA`g^e|tAD+-hR7ApjnP-vh z{eQGqKRo*bQ{Jfy_F9@S}A&IPf*NUL*Bf8$PHnq>8r4n|tF zU0-*>!Tp7LWuA;tS@$ITWA}%EB39K+v;f}-l{HxR-k^4Re|Ttm%{5=Q_Nfqo?lb^0 z$-hI*-WSX!S7SU^v@Z5dJQE$!{6Y)fxp&qJtM4-qL&iNXI%C>j^M|LC5Nlwg@2l(>AX5;}%Z-8|f;q$b*_d%WQdW9m1 z#iq34!B`R?zbO3TY!!4<0E&hctQh?=a4W7Jx zXOSou7HvKH=Nk>CrsjFk zmr~_{eB=WXdkm_odF(*WF!N6oZGM(T5TZHrFG7-Qae zDq-Oy`<%=IxS7J&C1`CV*nPI{`c(J#s$|YXLvk{{d0YbW^(=udxvA6c*R&k0MNA4@&m))>Hb!xRyJ1&RdoL1s zUrP=OVg3frG*{3g&sjD%JWv?!G}?YHgn!h%H+6gfYlz9Du-QJZue2{=r(FJ-is5VG~aP~v@h>@(0TEeSfF zdATojEXHwapn+DsHaj<)zQpklv%nc!mf|``X61JkBscG{X7O4AtbCl>nZ~VCQk$~z zdm)%;JQYF{?k7$B^HSeFD*--O9KK|C#Fhk=Dd(3+zEG?ps3>GCk}a4Rup@SkFVtoQ zKlYQa;j0wAv6!FxCL5ipo9DJ4xY|=xaC5zvCv&k8ANC1r$NGtc>CpHtepMb;IcL$l zFy)2+(6)a!F-*M#27@vn)=u((IPF;l9g9y$h}XOhP#ClChBhdc&H4q!P~=$ z!;^=t3dfs*F`E(H(m3d)xu~HXJ+ClZ0np`_?sw^gP@l5i(vw8b) zu^=wA@d6{7apnu0AbyM^hx66Ps}c+}Q;Y+Z!P%NZ9$4Q@tY~+=fKFJmQMQTTAshFgl* zrtKawFpTPG@1c4HjQ*i!mL97re{`bL5lZ#)ykA*d#%ZUTOn8v`9)-0PHfn;pLrzrS z*gmgO;KEX|yDx*_-jlGHEbBFfO^)ioiX<~`<_o;MWqrywn=6?vZUwzQ@sBLsxIH46 zPh2X-HRZ@KL(~~b06+r9(5;9I_4@SSZ@ZBX@P{VXyn^}W3Ch%c^T5>xt;6`A9Fl$l zcdkP(Kx|uN+eC;Z_hBf>SHl-14X9-*A-3A#~np(rl}bGLYmJ_tnU`96<+?t5Z(8b)lLD?;X*{ryuq&^+<=1lh)oU zXe^xfA$OEUM?I3vIPPqbj}_9ktSv@mT1TaXj5Vbyc5ieqklKG}TAS$BnT=5XF@EAQTDtq(d?z9Y-U^EQqWs9I~6kgSa!LEdOV47n-^3;^G){~T-E{PkyD|AGp5^1Mt z?BG$F{cUm*+>Ml(voj)4S#MOGIuV=og5+6uog%xT85{15AS|SoMy%HGEi;d$t$0#? z@#O{DaDagLbY=OUC(62cJ2j8ba>`0Zh{@OhjAhzZhIy51k4>#tJjgB z6<2%MNBwn=Iwi4=ziY}5R9f#f=!iaZ)CG!AuzE;jxeKT=>qPSjNhg>5&R27X(vZFu%nxawx=`L)#$HkNBg0A7cT+yf|H?= z?bVk60;QS2KQZ633GnYT^Tpi%I5&FjJ;bj442wM?qW|M=Yl`|Sq1j><{oupV^-PuZ zRHhstV}6CpLG34HP`|y{0I4BRyu%pgsu^xtNq#7c8|iJ!WnGrk%TDXO#bM=Hy+TrT zxPKNzB8M?dj~gGXQbWadUW(NZLyzUXyT4{t)B2-hvNbKjD`&=7ESz+{vntK0GSAXbAE_Mj=MJJ(;-Mnj=mo{u6i3 zDSkD)+n*si_ghfFr~0Lj|JC?zd5mb`ZlMZSOt|rFL-(ucU0Frc&$j@g|9js5s-}M( z<*o!FB={D1aJh#6fqxL>U&p!2>j=TR1smL(h=u$rU%!rUmv0dKe+w#{|DguIn%~`Z w2vWL558Sc;7L$Kb)UVceGXt?RZ*h(HKliDIG8){=NJv=lO9IZ2sejG+e@a0jn*aa+ literal 0 HcmV?d00001 diff --git a/docs/StardustDocs/topics/read.md b/docs/StardustDocs/topics/read.md index 91f883fe9..3550da482 100644 --- a/docs/StardustDocs/topics/read.md +++ b/docs/StardustDocs/topics/read.md @@ -401,7 +401,7 @@ Sometimes cells can have the wrong format in an Excel file. For example, you exp ```text IDS -100 <-- Intended to be String, but has wrong cell format in original .xlsx file +100 <-- Intended to be String, but has numeric cell format in original .xlsx file A100 B100 C100 @@ -409,20 +409,12 @@ C100 You will get column of `Serializable` instead (common parent for `Double` and `String`). -You can fix it using the `.convert()` function: +You can fix it by providing an additional parameter: ```kotlin -val df = dataFrameOf("IDS")(100.0, "A100", "B100", "C100") -val df1 = df.convert("IDS").with(Infer.Type) { - if (it is Double) { - it.toLong().toString() - } else { - it - } -} -df1["IDS"].type() shouldBe typeOf() +val df = DataFrame.readExcel("mixed_column.xlsx", stringColumns = StringColumns("A")) ``` diff --git a/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Read.kt b/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Read.kt index 371a9e566..8fd02b5b7 100644 --- a/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Read.kt +++ b/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Read.kt @@ -3,20 +3,19 @@ package org.jetbrains.kotlinx.dataframe.samples.api import io.kotest.matchers.shouldBe import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.DataRow -import org.jetbrains.kotlinx.dataframe.api.Infer import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.api.columnNames import org.jetbrains.kotlinx.dataframe.api.columnTypes -import org.jetbrains.kotlinx.dataframe.api.convert -import org.jetbrains.kotlinx.dataframe.api.dataFrameOf -import org.jetbrains.kotlinx.dataframe.api.with import org.jetbrains.kotlinx.dataframe.io.ColType +import org.jetbrains.kotlinx.dataframe.io.StringColumns import org.jetbrains.kotlinx.dataframe.io.readArrowFeather import org.jetbrains.kotlinx.dataframe.io.readCSV +import org.jetbrains.kotlinx.dataframe.io.readExcel import org.jetbrains.kotlinx.dataframe.io.readJson import org.jetbrains.kotlinx.dataframe.testArrowFeather import org.jetbrains.kotlinx.dataframe.testCsv import org.jetbrains.kotlinx.dataframe.testJson +import org.junit.Ignore import org.junit.Test import java.util.* import kotlin.reflect.typeOf @@ -63,17 +62,10 @@ class Read { } @Test + @Ignore fun fixMixedColumn() { // SampleStart - val df = dataFrameOf("IDS")(100.0, "A100", "B100", "C100") - val df1 = df.convert("IDS").with(Infer.Type) { - if (it is Double) { - it.toLong().toString() - } else { - it - } - } - df1["IDS"].type() shouldBe typeOf() + val df = DataFrame.readExcel("mixed_column.xlsx", stringColumns = StringColumns("A")) // SampleEnd }