diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 6ca02e2f4..9f950415b 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -9,6 +9,7 @@ import kotlinx.datetime.toKotlinLocalDate import kotlinx.datetime.toKotlinLocalDateTime import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.api.cast import org.jetbrains.kotlinx.dataframe.api.columnOf import org.jetbrains.kotlinx.dataframe.api.convertTo @@ -128,6 +129,17 @@ class ParserTests { ) } + @Test + fun `custom nullStrings`() { + val col by columnOf("1", "2", "null", "3", "NA", "nothing", "4.0", "5.0") + + val parsed = col.tryParse( + ParserOptions(nullStrings = setOf("null", "NA", "nothing")), + ) + parsed.type() shouldBe typeOf<Double?>() + parsed.toList() shouldBe listOf(1, 2, null, 3, null, null, 4.0, 5.0) + } + @Test fun `converting String to Double in different locales`() { val currentLocale = Locale.getDefault() diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/DelimParams.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/DelimParams.kt index 0bb4426ae..b13dde21f 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/DelimParams.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/DelimParams.kt @@ -7,7 +7,7 @@ import org.jetbrains.kotlinx.dataframe.io.ColType import org.jetbrains.kotlinx.dataframe.io.Compression import org.jetbrains.kotlinx.dataframe.io.DEFAULT_COL_TYPE import org.jetbrains.kotlinx.dataframe.io.QuoteMode -import org.jetbrains.kotlinx.dataframe.io.defaultDelimParserOptions +import org.jetbrains.kotlinx.dataframe.io.DEFAULT_PARSER_OPTIONS 
/** * Contains both the default values of csv/tsv parameters and the parameter KDocs. @@ -78,12 +78,12 @@ internal object DelimParams { /** * @param parserOptions Optional [parsing options][ParserOptions] for columns initially read as [String]. * Can configure locale, date format, double parsing, skipping types, etc. - * Default, [defaultDelimParserOptions]: + * Default, [DEFAULT_PARSER_OPTIONS]: * ``` * ParserOptions(nullStrings = ["", "NA", "N/A", "null", "NULL", "None", "none", "NIL", "nil"]) * ``` */ - val PARSER_OPTIONS: ParserOptions = defaultDelimParserOptions + val PARSER_OPTIONS: ParserOptions = DEFAULT_PARSER_OPTIONS /** * @param ignoreEmptyLines If `true`, intermediate empty lines will be skipped. diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt index a9ee7cbea..cd8b9422b 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt @@ -36,7 +36,7 @@ import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator import org.jetbrains.kotlinx.dataframe.io.ColType import org.jetbrains.kotlinx.dataframe.io.Compression import org.jetbrains.kotlinx.dataframe.io.DEFAULT_COL_TYPE -import org.jetbrains.kotlinx.dataframe.io.defaultNullStrings +import org.jetbrains.kotlinx.dataframe.io.DEFAULT_NULL_STRINGS import java.io.InputStream import java.math.BigDecimal import java.net.URL @@ -86,7 +86,7 @@ internal fun readDelimImpl( val csvSpecs = with(CsvSpecs.builder()) { if (additionalCsvSpecs != null) from(additionalCsvSpecs) customDoubleParser(FastDoubleParser(parserOptions)) - nullValueLiterals(parserOptions.nullStrings ?: defaultNullStrings) + nullValueLiterals(parserOptions.nullStrings ?: DEFAULT_NULL_STRINGS) headerLegalizer(::legalizeHeader) numRows(readLines ?: Long.MAX_VALUE) ignoreEmptyLines(ignoreEmptyLines) 
diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/util.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/util.kt index d64dacdd1..3bf9e7be8 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/util.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/util.kt @@ -11,8 +11,8 @@ public const val DEFAULT_COL_TYPE: String = ".default" /** * Default strings that are considered null. */ -public val defaultNullStrings: Set<String> = +public val DEFAULT_NULL_STRINGS: Set<String> = setOf("", "NA", "N/A", "null", "NULL", "None", "none", "NIL", "nil") -public val defaultDelimParserOptions: ParserOptions = - ParserOptions(nullStrings = defaultNullStrings) +public val DEFAULT_PARSER_OPTIONS: ParserOptions = + ParserOptions(nullStrings = DEFAULT_NULL_STRINGS) diff --git a/dataframe-csv/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/DelimCsvTsvTests.kt b/dataframe-csv/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/DelimCsvTsvTests.kt index 70b2e6a57..e4254b2dd 100644 --- a/dataframe-csv/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/DelimCsvTsvTests.kt +++ b/dataframe-csv/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/DelimCsvTsvTests.kt @@ -529,24 +529,27 @@ class DelimCsvTsvTests { } @Test - fun `parse with wrong locales`() { - @Language("csv") - val csv = - """ - name; price; - a;12,45; - b;-13,35; - c;100.123,35; - d;-204.235,23; - """.trimIndent() - - val df = DataFrame.readCsvStr( - text = csv, - delimiter = ';', + fun `NA and custom null string in double column`() { + val df = DataFrame.readCsv( + msleepCsv, parserOptions = ParserOptions( - locale = Locale.GERMAN, + nullStrings = DEFAULT_NULL_STRINGS + "nothing", ), ) + + df.print(columnTypes = true, borders = true, title = true) + + df["name"].type() shouldBe typeOf<String>() + df["genus"].type() shouldBe typeOf<String>() + df["vore"].type() shouldBe typeOf<String?>() + df["order"].type() shouldBe typeOf<String>() + df["conservation"].type() shouldBe typeOf<String?>() + 
df["sleep_total"].type() shouldBe typeOf<Double>() + df["sleep_rem"].type() shouldBe typeOf<Double?>() + df["sleep_cycle"].type() shouldBe typeOf<Double?>() + df["awake"].type() shouldBe typeOf<Double>() + df["brainwt"].type() shouldBe typeOf<Double?>() + df["bodywt"].type() shouldBe typeOf<Double>() } companion object { @@ -558,6 +561,7 @@ class DelimCsvTsvTests { private val wineCsv = testCsv("wine") private val durationCsv = testCsv("duration") private val withBomCsv = testCsv("with-bom") + private val msleepCsv = testCsv("msleep") } } diff --git a/dataframe-csv/src/test/resources/msleep.csv b/dataframe-csv/src/test/resources/msleep.csv new file mode 100644 index 000000000..57dcd2c05 --- /dev/null +++ b/dataframe-csv/src/test/resources/msleep.csv @@ -0,0 +1,84 @@ +name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt +Cheetah,Acinonyx,carni,Carnivora,lc,12.1,nothing,NA,11.9,NA,50 +Owl monkey,Aotus,omni,Primates,NA,17,1.8,NA,7,0.0155,0.48 +Mountain beaver,Aplodontia,herbi,Rodentia,nt,14.4,2.4,NA,9.6,NA,1.35 +Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9,2.3,0.133333333,9.1,0.00029,0.019 +Cow,Bos,herbi,Artiodactyla,domesticated,4,0.7,0.666666667,20,0.423,600 +Three-toed sloth,Bradypus,herbi,Pilosa,NA,14.4,2.2,0.766666667,9.6,NA,3.85 +Northern fur seal,Callorhinus,carni,Carnivora,vu,8.7,1.4,0.383333333,15.3,NA,20.49 +Vesper mouse,Calomys,NA,Rodentia,NA,7,NA,NA,17,NA,0.045 +Dog,Canis,carni,Carnivora,domesticated,10.1,2.9,0.333333333,13.9,0.07,14 +Roe deer,Capreolus,herbi,Artiodactyla,lc,3,NA,NA,21,0.0982,14.8 +Goat,Capri,herbi,Artiodactyla,lc,5.3,0.6,NA,18.7,0.115,33.5 +Guinea pig,Cavis,herbi,Rodentia,domesticated,9.4,0.8,0.216666667,14.6,0.0055,0.728 +Grivet,Cercopithecus,omni,Primates,lc,10,0.7,NA,14,NA,4.75 +Chinchilla,Chinchilla,herbi,Rodentia,domesticated,12.5,1.5,0.116666667,11.5,0.0064,0.42 +Star-nosed mole,Condylura,omni,Soricomorpha,lc,10.3,2.2,NA,13.7,0.001,0.06 +African giant pouched rat,Cricetomys,omni,Rodentia,NA,8.3,2,NA,15.7,0.0066,1 +Lesser 
short-tailed shrew,Cryptotis,omni,Soricomorpha,lc,9.1,1.4,0.15,14.9,0.00014,0.005 +Long-nosed armadillo,Dasypus,carni,Cingulata,lc,17.4,3.1,0.383333333,6.6,0.0108,3.5 +Tree hyrax,Dendrohyrax,herbi,Hyracoidea,lc,5.3,0.5,NA,18.7,0.0123,2.95 +North American Opossum,Didelphis,omni,Didelphimorphia,lc,18,4.9,0.333333333,6,0.0063,1.7 +Asian elephant,Elephas,herbi,Proboscidea,en,3.9,NA,NA,20.1,4.603,2547 +Big brown bat,Eptesicus,insecti,Chiroptera,lc,19.7,3.9,0.116666667,4.3,3e-04,0.023 +Horse,Equus,herbi,Perissodactyla,domesticated,2.9,0.6,1,21.1,0.655,521 +Donkey,Equus,herbi,Perissodactyla,domesticated,3.1,0.4,NA,20.9,0.419,187 +European hedgehog,Erinaceus,omni,Erinaceomorpha,lc,10.1,3.5,0.283333333,13.9,0.0035,0.77 +Patas monkey,Erythrocebus,omni,Primates,lc,10.9,1.1,NA,13.1,0.115,10 +Western american chipmunk,Eutamias,herbi,Rodentia,NA,14.9,NA,NA,9.1,NA,0.071 +Domestic cat,Felis,carni,Carnivora,domesticated,12.5,3.2,0.416666667,11.5,0.0256,3.3 +Galago,Galago,omni,Primates,NA,9.8,1.1,0.55,14.2,0.005,0.2 +Giraffe,Giraffa,herbi,Artiodactyla,cd,1.9,0.4,NA,22.1,NA,899.995 +Pilot whale,Globicephalus,carni,Cetacea,cd,2.7,0.1,NA,21.35,NA,800 +Gray seal,Haliochoerus,carni,Carnivora,lc,6.2,1.5,NA,17.8,0.325,85 +Gray hyrax,Heterohyrax,herbi,Hyracoidea,lc,6.3,0.6,NA,17.7,0.01227,2.625 +Human,Homo,omni,Primates,NA,8,1.9,1.5,16,1.32,62 +Mongoose lemur,Lemur,herbi,Primates,vu,9.5,0.9,NA,14.5,NA,1.67 +African elephant,Loxodonta,herbi,Proboscidea,vu,3.3,NA,NA,20.7,5.712,6654 +Thick-tailed opposum,Lutreolina,carni,Didelphimorphia,lc,19.4,6.6,NA,4.6,NA,0.37 +Macaque,Macaca,omni,Primates,NA,10.1,1.2,0.75,13.9,0.179,6.8 +Mongolian gerbil,Meriones,herbi,Rodentia,lc,14.2,1.9,NA,9.8,NA,0.053 +Golden hamster,Mesocricetus,herbi,Rodentia,en,14.3,3.1,0.2,9.7,0.001,0.12 +Vole ,Microtus,herbi,Rodentia,NA,12.8,NA,NA,11.2,NA,0.035 +House mouse,Mus,herbi,Rodentia,nt,12.5,1.4,0.183333333,11.5,4e-04,0.022 +Little brown bat,Myotis,insecti,Chiroptera,NA,19.9,2,0.2,4.1,0.00025,0.01 +Round-tailed 
muskrat,Neofiber,herbi,Rodentia,nt,14.6,NA,NA,9.4,NA,0.266 +Slow loris,Nyctibeus,carni,Primates,NA,11,NA,NA,13,0.0125,1.4 +Degu,Octodon,herbi,Rodentia,lc,7.7,0.9,NA,16.3,NA,0.21 +Northern grasshopper mouse,Onychomys,carni,Rodentia,lc,14.5,NA,NA,9.5,NA,0.028 +Rabbit,Oryctolagus,herbi,Lagomorpha,domesticated,8.4,0.9,0.416666667,15.6,0.0121,2.5 +Sheep,Ovis,herbi,Artiodactyla,domesticated,3.8,0.6,NA,20.2,0.175,55.5 +Chimpanzee,Pan,omni,Primates,NA,9.7,1.4,1.416666667,14.3,0.44,52.2 +Tiger,Panthera,carni,Carnivora,en,15.8,NA,NA,8.2,NA,162.564 +Jaguar,Panthera,carni,Carnivora,nt,10.4,NA,NA,13.6,0.157,100 +Lion,Panthera,carni,Carnivora,vu,13.5,NA,NA,10.5,NA,161.499 +Baboon,Papio,omni,Primates,NA,9.4,1,0.666666667,14.6,0.18,25.235 +Desert hedgehog,Paraechinus,NA,Erinaceomorpha,lc,10.3,2.7,NA,13.7,0.0024,0.55 +Potto,Perodicticus,omni,Primates,lc,11,NA,NA,13,NA,1.1 +Deer mouse,Peromyscus,NA,Rodentia,NA,11.5,NA,NA,12.5,NA,0.021 +Phalanger,Phalanger,NA,Diprotodontia,NA,13.7,1.8,NA,10.3,0.0114,1.62 +Caspian seal,Phoca,carni,Carnivora,vu,3.5,0.4,NA,20.5,NA,86 +Common porpoise,Phocoena,carni,Cetacea,vu,5.6,NA,NA,18.45,NA,53.18 +Potoroo,Potorous,herbi,Diprotodontia,NA,11.1,1.5,NA,12.9,NA,1.1 +Giant armadillo,Priodontes,insecti,Cingulata,en,18.1,6.1,NA,5.9,0.081,60 +Rock hyrax,Procavia,NA,Hyracoidea,lc,5.4,0.5,NA,18.6,0.021,3.6 +Laboratory rat,Rattus,herbi,Rodentia,lc,13,2.4,0.183333333,11,0.0019,0.32 +African striped mouse,Rhabdomys,omni,Rodentia,NA,8.7,NA,NA,15.3,NA,0.044 +Squirrel monkey,Saimiri,omni,Primates,NA,9.6,1.4,NA,14.4,0.02,0.743 +Eastern american mole,Scalopus,insecti,Soricomorpha,lc,8.4,2.1,0.166666667,15.6,0.0012,0.075 +Cotton rat,Sigmodon,herbi,Rodentia,NA,11.3,1.1,0.15,12.7,0.00118,0.148 +Mole rat,Spalax,NA,Rodentia,NA,10.6,2.4,NA,13.4,0.003,0.122 +Arctic ground squirrel,Spermophilus,herbi,Rodentia,lc,16.6,NA,NA,7.4,0.0057,0.92 +Thirteen-lined ground squirrel,Spermophilus,herbi,Rodentia,lc,13.8,3.4,0.216666667,10.2,0.004,0.101 +Golden-mantled ground 
squirrel,Spermophilus,herbi,Rodentia,lc,15.9,3,NA,8.1,NA,0.205 +Musk shrew,Suncus,NA,Soricomorpha,NA,12.8,2,0.183333333,11.2,0.00033,0.048 +Pig,Sus,omni,Artiodactyla,domesticated,9.1,2.4,0.5,14.9,0.18,86.25 +Short-nosed echidna,Tachyglossus,insecti,Monotremata,NA,8.6,NA,NA,15.4,0.025,4.5 +Eastern american chipmunk,Tamias,herbi,Rodentia,NA,15.8,NA,NA,8.2,NA,0.112 +Brazilian tapir,Tapirus,herbi,Perissodactyla,vu,4.4,1,0.9,19.6,0.169,207.501 +Tenrec,Tenrec,omni,Afrosoricida,NA,15.6,2.3,NA,8.4,0.0026,0.9 +Tree shrew,Tupaia,omni,Scandentia,NA,8.9,2.6,0.233333333,15.1,0.0025,0.104 +Bottle-nosed dolphin,Tursiops,carni,Cetacea,NA,5.2,NA,NA,18.8,NA,173.33 +Genet,Genetta,carni,Carnivora,NA,6.3,1.3,NA,17.7,0.0175,2 +Arctic fox,Vulpes,carni,Carnivora,NA,12.5,NA,NA,11.5,0.0445,3.38 +Red fox,Vulpes,carni,Carnivora,NA,9.8,2.4,0.35,14.2,0.0504,4.23