From 2f51ebf83df135cb6391546c56b70fc6bd6ef52b Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Fri, 4 Oct 2024 17:56:31 +0200 Subject: [PATCH] supporting different compressions csv --- .../kotlinx/dataframe/impl/io/CsvTsvParams.kt | 16 ++-- .../kotlinx/dataframe/impl/io/ioUtils.kt | 61 ++++++++++++- .../kotlinx/dataframe/impl/io/readCsvOrTsv.kt | 53 ++++++----- .../dataframe/impl/io/writeCsvOrTsv.kt | 2 +- .../kotlinx/dataframe/io/CsvCompression.kt | 24 +++++ .../jetbrains/kotlinx/dataframe/io/readCsv.kt | 19 ++-- .../jetbrains/kotlinx/dataframe/io/readTsv.kt | 19 ++-- .../kotlinx/dataframe/io/writeCsv.kt | 8 +- .../kotlinx/dataframe/io/writeTsv.kt | 8 +- .../kotlinx/dataframe/io/CsvTsvTests.kt | 84 ++++++++++++++++-- .../src/test/resources/testCSV.csv.gz | Bin 0 -> 265 bytes dataframe-csv/src/test/resources/testCSV.zip | Bin 0 -> 427 bytes dataframe-csv/src/test/resources/two csvs.zip | Bin 0 -> 1451 bytes 13 files changed, 224 insertions(+), 70 deletions(-) create mode 100644 dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvCompression.kt create mode 100644 dataframe-csv/src/test/resources/testCSV.csv.gz create mode 100644 dataframe-csv/src/test/resources/testCSV.zip create mode 100644 dataframe-csv/src/test/resources/two csvs.zip diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/CsvTsvParams.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/CsvTsvParams.kt index 982344b6bd..d169ec7a00 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/CsvTsvParams.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/CsvTsvParams.kt @@ -4,6 +4,7 @@ import io.deephaven.csv.CsvSpecs import org.apache.commons.csv.CSVFormat import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.io.ColType +import org.jetbrains.kotlinx.dataframe.io.CsvCompression import org.jetbrains.kotlinx.dataframe.io.DEFAULT_COL_TYPE import org.jetbrains.kotlinx.dataframe.io.QuoteMode @@ -31,10 +32,11 @@ internal object CsvTsvParams { val HEADER: List = emptyList() /** - * @param isCompressed If `true`, the input stream is compressed and will be decompressed before reading. - * The default is `false`. + * @param compression Determines the compression of the CSV file. + * If a ZIP file contains multiple files, an [IllegalArgumentException] is thrown. + * The default is [CsvCompression.None]. */ - const val IS_COMPRESSED: Boolean = false + val COMPRESSION: CsvCompression<*> = CsvCompression.None /** * @param colTypes A map of column names to their expected [ColType]s. Can be supplied to force @@ -70,7 +72,7 @@ internal object CsvTsvParams { ) /** - * @param ignoreEmptyLines If `true`, empty lines will be skipped. + * @param ignoreEmptyLines If `true`, intermediate empty lines will be skipped. * The default is `false`. */ const val IGNORE_EMPTY_LINES: Boolean = false @@ -79,9 +81,9 @@ internal object CsvTsvParams { * @param allowMissingColumns If this set to `true`, then rows that are too short * (that have fewer columns than the header row) will be interpreted as if the missing columns contained * the empty string. - * The default is `false`. + * The default is `true`. */ - const val ALLOW_MISSING_COLUMNS: Boolean = false + const val ALLOW_MISSING_COLUMNS: Boolean = true /** * @param ignoreExcessColumns If this set to `true`, then rows that are too long @@ -158,7 +160,7 @@ internal object CsvTsvParams { * @param recordSeparator The character that separates records in a CSV/TSV file. * The default is `'\n'`. */ - const val RECORD_SEPARATOR: Char = '\n' + const val RECORD_SEPARATOR: String = "\n" /** * @param headerComments A list of comments to include at the beginning of the CSV/TSV file. diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/ioUtils.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/ioUtils.kt index dee013cc6f..abeba26ff4 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/ioUtils.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/ioUtils.kt @@ -1,19 +1,36 @@ package org.jetbrains.kotlinx.dataframe.impl.io +import org.apache.commons.io.input.BOMInputStream import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.io.CsvCompression +import org.jetbrains.kotlinx.dataframe.io.CsvCompression.Custom +import org.jetbrains.kotlinx.dataframe.io.CsvCompression.Gzip +import org.jetbrains.kotlinx.dataframe.io.CsvCompression.None +import org.jetbrains.kotlinx.dataframe.io.CsvCompression.Zip import org.jetbrains.kotlinx.dataframe.io.isURL import org.jetbrains.kotlinx.dataframe.io.readJson import java.io.File import java.io.InputStream import java.net.HttpURLConnection import java.net.URL +import java.util.zip.ZipInputStream -internal fun isCompressed(fileOrUrl: String) = listOf("gz", "zip").contains(fileOrUrl.split(".").last()) +internal fun compressionStateOf(fileOrUrl: String): CsvCompression<*> = + when (fileOrUrl.split(".").last()) { + "gz" -> CsvCompression.Gzip + "zip" -> CsvCompression.Zip + else -> CsvCompression.None + } -internal fun isCompressed(file: File) = listOf("gz", "zip").contains(file.extension) +internal fun compressionStateOf(file: File): CsvCompression<*> = + when (file.extension) { + "gz" -> CsvCompression.Gzip + "zip" -> CsvCompression.Zip + else -> CsvCompression.None + } -internal fun isCompressed(url: URL) = isCompressed(url.path) +internal fun compressionStateOf(url: URL): CsvCompression<*> = compressionStateOf(url.path) internal fun catchHttpResponse(url: URL, body: (InputStream) -> AnyFrame): AnyFrame { val connection = url.openConnection() @@ -42,5 +59,41 @@ public fun asURL(fileOrUrl: String): URL = if (isURL(fileOrUrl)) { URL(fileOrUrl).toURI() } else { - File(fileOrUrl).toURI() + File(fileOrUrl).also { + require(it.exists()) { "File not found: \"$fileOrUrl\"" } + require(it.isFile) { "Not a file: \"$fileOrUrl\"" } + }.toURI() }.toURL() + +internal inline fun InputStream.useSafely(compression: CsvCompression<*>, block: (InputStream) -> T): T { + var zipInputStream: ZipInputStream? = null + + // first wrap the stream in the compression algorithm + val unpackedStream = when (compression) { + None -> this + + Zip -> compression(this).also { + it as ZipInputStream + // make sure to call nextEntry once to prepare the stream + if (it.nextEntry == null) error("No entries in zip file") + + zipInputStream = it + } + + Gzip -> compression(this) + + is Custom<*> -> compression(this) + } + + val bomSafeStream = BOMInputStream.builder().setInputStream(unpackedStream).get() + + try { + return block(bomSafeStream) + } finally { + // if we were reading from a ZIP, make sure there was only one entry, as to + // warn the user of potential issues + if (compression == Zip && zipInputStream!!.nextEntry != null) { + throw IllegalArgumentException("Zip file contains more than one entry") + } + } +} diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readCsvOrTsv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readCsvOrTsv.kt index 587b0b0157..25784743f2 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readCsvOrTsv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readCsvOrTsv.kt @@ -23,7 +23,6 @@ import kotlinx.datetime.Instant import kotlinx.datetime.LocalDate import kotlinx.datetime.LocalDateTime import kotlinx.datetime.LocalTime -import org.apache.commons.io.input.BOMInputStream import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.DataRow @@ -34,11 +33,11 @@ import org.jetbrains.kotlinx.dataframe.api.tryParse import org.jetbrains.kotlinx.dataframe.columns.ValueColumn import org.jetbrains.kotlinx.dataframe.impl.ColumnNameGenerator import org.jetbrains.kotlinx.dataframe.io.ColType +import org.jetbrains.kotlinx.dataframe.io.CsvCompression import org.jetbrains.kotlinx.dataframe.io.DEFAULT_COL_TYPE import java.io.InputStream import java.math.BigDecimal import java.net.URL -import java.util.zip.GZIPInputStream import kotlin.reflect.KType import kotlin.reflect.full.withNullability import kotlin.reflect.typeOf @@ -49,7 +48,7 @@ import kotlin.time.Duration * @include [CsvTsvParams.INPUT_STREAM] * @param delimiter The field delimiter character. The default is ',' for CSV, '\t' for TSV. * @include [CsvTsvParams.HEADER] - * @include [CsvTsvParams.IS_COMPRESSED] + * @include [CsvTsvParams.COMPRESSION] * @include [CsvTsvParams.COL_TYPES] * @include [CsvTsvParams.SKIP_LINES] * @include [CsvTsvParams.READ_LINES] @@ -67,7 +66,7 @@ internal fun readCsvOrTsvImpl( inputStream: InputStream, delimiter: Char, header: List = CsvTsvParams.HEADER, - isCompressed: Boolean = CsvTsvParams.IS_COMPRESSED, + compression: CsvCompression<*> = CsvTsvParams.COMPRESSION, colTypes: Map = CsvTsvParams.COL_TYPES, skipLines: Long = CsvTsvParams.SKIP_LINES, readLines: Long? = CsvTsvParams.READ_LINES, @@ -115,32 +114,30 @@ internal fun readCsvOrTsvImpl( colTypes(colTypes, useDeepHavenLocalDateTime) // this function must be last, so the return value is used }.build() - val adjustedInputStream = inputStream - .let { if (isCompressed) GZIPInputStream(it) else it } - .let { BOMInputStream.builder().setInputStream(it).get() } - - if (adjustedInputStream.available() <= 0) { - return if (header.isEmpty()) { - DataFrame.empty() - } else { - dataFrameOf( - header.map { - DataColumn.createValueColumn( - name = it, - values = emptyList(), - type = typeOf(), - ) - }, - ) + val csvReaderResult = inputStream.useSafely(compression) { safeInputStream -> + if (safeInputStream.available() <= 0) { + return if (header.isEmpty()) { + DataFrame.empty() + } else { + dataFrameOf( + header.map { + DataColumn.createValueColumn( + name = it, + values = emptyList(), + type = typeOf(), + ) + }, + ) + } } - } - // read the csv - val csvReaderResult = CsvReader.read( - csvSpecs, - adjustedInputStream, - ListSink.SINK_FACTORY, - ) + // read the csv + CsvReader.read( + csvSpecs, + safeInputStream, + ListSink.SINK_FACTORY, + ) + } val defaultColType = colTypes[DEFAULT_COL_TYPE] diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeCsvOrTsv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeCsvOrTsv.kt index a402757e32..2ff72d032f 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeCsvOrTsv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/writeCsvOrTsv.kt @@ -18,7 +18,7 @@ internal fun writeCsvOrTsvImpl( escapeChar: Char? = CsvTsvParams.ESCAPE_CHAR, commentChar: Char? = CsvTsvParams.COMMENT_CHAR, headerComments: List = CsvTsvParams.HEADER_COMMENTS, - recordSeparator: Char = CsvTsvParams.RECORD_SEPARATOR, + recordSeparator: String = CsvTsvParams.RECORD_SEPARATOR, additionalCsvFormat: CSVFormat = CsvTsvParams.ADDITIONAL_CSV_FORMAT, ) { val format = with(CSVFormat.Builder.create(additionalCsvFormat)) { diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvCompression.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvCompression.kt new file mode 100644 index 0000000000..47fa8bfc90 --- /dev/null +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvCompression.kt @@ -0,0 +1,24 @@ +package org.jetbrains.kotlinx.dataframe.io + +import java.io.InputStream +import java.util.zip.GZIPInputStream +import java.util.zip.ZipInputStream + +/** + * Compression algorithm to use when reading csv files. + * We support GZIP and ZIP compression out of the box. + * + * Custom compression algorithms can be added by creating an instance of [Custom]. + */ +public sealed class CsvCompression(public open val wrapStream: (InputStream) -> I) : + (InputStream) -> I by wrapStream { + + public data object Gzip : CsvCompression(::GZIPInputStream) + + public data object Zip : CsvCompression(::ZipInputStream) + + public data object None : CsvCompression({ it }) + + public data class Custom(override val wrapStream: (InputStream) -> I) : + CsvCompression(wrapStream) +} diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readCsv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readCsv.kt index 88b704bab2..d0b7d03782 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readCsv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readCsv.kt @@ -6,7 +6,7 @@ import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.impl.io.CsvTsvParams import org.jetbrains.kotlinx.dataframe.impl.io.asURL import org.jetbrains.kotlinx.dataframe.impl.io.catchHttpResponse -import org.jetbrains.kotlinx.dataframe.impl.io.isCompressed +import org.jetbrains.kotlinx.dataframe.impl.io.compressionStateOf import org.jetbrains.kotlinx.dataframe.impl.io.readCsvOrTsvImpl import java.io.File import java.io.FileInputStream @@ -24,6 +24,7 @@ public fun DataFrame.Companion.readCsv( file: File, delimiter: Char = CsvTsvParams.CSV_DELIMITER, header: List = CsvTsvParams.HEADER, + compression: CsvCompression<*> = compressionStateOf(file), colTypes: Map = CsvTsvParams.COL_TYPES, skipLines: Long = CsvTsvParams.SKIP_LINES, readLines: Long? = CsvTsvParams.READ_LINES, @@ -41,7 +42,7 @@ public fun DataFrame.Companion.readCsv( inputStream = it, delimiter = delimiter, header = header, - isCompressed = isCompressed(file), + compression = compression, colTypes = colTypes, skipLines = skipLines, readLines = readLines, @@ -61,6 +62,7 @@ public fun DataFrame.Companion.readCsv( url: URL, delimiter: Char = CsvTsvParams.CSV_DELIMITER, header: List = CsvTsvParams.HEADER, + compression: CsvCompression<*> = compressionStateOf(url), colTypes: Map = CsvTsvParams.COL_TYPES, skipLines: Long = CsvTsvParams.SKIP_LINES, readLines: Long? = CsvTsvParams.READ_LINES, @@ -78,7 +80,7 @@ public fun DataFrame.Companion.readCsv( inputStream = it, delimiter = delimiter, header = header, - isCompressed = isCompressed(url), + compression = compression, colTypes = colTypes, skipLines = skipLines, readLines = readLines, @@ -98,6 +100,7 @@ public fun DataFrame.Companion.readCsv( fileOrUrl: String, delimiter: Char = CsvTsvParams.CSV_DELIMITER, header: List = CsvTsvParams.HEADER, + compression: CsvCompression<*> = compressionStateOf(fileOrUrl), colTypes: Map = CsvTsvParams.COL_TYPES, skipLines: Long = CsvTsvParams.SKIP_LINES, readLines: Long? = CsvTsvParams.READ_LINES, @@ -115,7 +118,7 @@ public fun DataFrame.Companion.readCsv( inputStream = it, delimiter = delimiter, header = header, - isCompressed = isCompressed(fileOrUrl), + compression = compression, colTypes = colTypes, skipLines = skipLines, readLines = readLines, @@ -136,7 +139,7 @@ public fun DataFrame.Companion.readCsv( inputStream: InputStream, delimiter: Char = CsvTsvParams.CSV_DELIMITER, header: List = CsvTsvParams.HEADER, - isCompressed: Boolean = CsvTsvParams.IS_COMPRESSED, + compression: CsvCompression<*> = CsvTsvParams.COMPRESSION, colTypes: Map = CsvTsvParams.COL_TYPES, skipLines: Long = CsvTsvParams.SKIP_LINES, readLines: Long? = CsvTsvParams.READ_LINES, @@ -154,7 +157,7 @@ public fun DataFrame.Companion.readCsv( inputStream = inputStream, delimiter = delimiter, header = header, - isCompressed = isCompressed, + compression = compression, colTypes = colTypes, skipLines = skipLines, readLines = readLines, @@ -174,7 +177,7 @@ public fun DataFrame.Companion.readCsvStr( text: String, delimiter: Char = CsvTsvParams.CSV_DELIMITER, header: List = CsvTsvParams.HEADER, - isCompressed: Boolean = CsvTsvParams.IS_COMPRESSED, + compression: CsvCompression<*> = CsvTsvParams.COMPRESSION, colTypes: Map = CsvTsvParams.COL_TYPES, skipLines: Long = CsvTsvParams.SKIP_LINES, readLines: Long? = CsvTsvParams.READ_LINES, @@ -191,7 +194,7 @@ public fun DataFrame.Companion.readCsvStr( inputStream = text.byteInputStream(), delimiter = delimiter, header = header, - isCompressed = isCompressed, + compression = compression, colTypes = colTypes, skipLines = skipLines, readLines = readLines, diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readTsv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readTsv.kt index 60e980d077..e48b4adbbd 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readTsv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/readTsv.kt @@ -6,7 +6,7 @@ import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.impl.io.CsvTsvParams import org.jetbrains.kotlinx.dataframe.impl.io.asURL import org.jetbrains.kotlinx.dataframe.impl.io.catchHttpResponse -import org.jetbrains.kotlinx.dataframe.impl.io.isCompressed +import org.jetbrains.kotlinx.dataframe.impl.io.compressionStateOf import org.jetbrains.kotlinx.dataframe.impl.io.readCsvOrTsvImpl import java.io.File import java.io.FileInputStream @@ -18,6 +18,7 @@ public fun DataFrame.Companion.readTsv( file: File, delimiter: Char = CsvTsvParams.TSV_DELIMITER, header: List = CsvTsvParams.HEADER, + compression: CsvCompression<*> = compressionStateOf(file), colTypes: Map = CsvTsvParams.COL_TYPES, skipLines: Long = CsvTsvParams.SKIP_LINES, readLines: Long? = CsvTsvParams.READ_LINES, @@ -35,7 +36,7 @@ public fun DataFrame.Companion.readTsv( inputStream = it, delimiter = delimiter, header = header, - isCompressed = isCompressed(file), + compression = compression, colTypes = colTypes, skipLines = skipLines, readLines = readLines, @@ -55,6 +56,7 @@ public fun DataFrame.Companion.readTsv( url: URL, delimiter: Char = CsvTsvParams.TSV_DELIMITER, header: List = CsvTsvParams.HEADER, + compression: CsvCompression<*> = compressionStateOf(url), colTypes: Map = CsvTsvParams.COL_TYPES, skipLines: Long = CsvTsvParams.SKIP_LINES, readLines: Long? = CsvTsvParams.READ_LINES, @@ -72,7 +74,7 @@ public fun DataFrame.Companion.readTsv( inputStream = it, delimiter = delimiter, header = header, - isCompressed = isCompressed(url), + compression = compression, colTypes = colTypes, skipLines = skipLines, readLines = readLines, @@ -92,6 +94,7 @@ public fun DataFrame.Companion.readTsv( fileOrUrl: String, delimiter: Char = CsvTsvParams.TSV_DELIMITER, header: List = CsvTsvParams.HEADER, + compression: CsvCompression<*> = compressionStateOf(fileOrUrl), colTypes: Map = CsvTsvParams.COL_TYPES, skipLines: Long = CsvTsvParams.SKIP_LINES, readLines: Long? = CsvTsvParams.READ_LINES, @@ -109,7 +112,7 @@ public fun DataFrame.Companion.readTsv( inputStream = it, delimiter = delimiter, header = header, - isCompressed = isCompressed(fileOrUrl), + compression = compression, colTypes = colTypes, skipLines = skipLines, readLines = readLines, @@ -130,7 +133,7 @@ public fun DataFrame.Companion.readTsv( inputStream: InputStream, delimiter: Char = CsvTsvParams.TSV_DELIMITER, header: List = CsvTsvParams.HEADER, - isCompressed: Boolean = CsvTsvParams.IS_COMPRESSED, + compression: CsvCompression<*> = CsvTsvParams.COMPRESSION, colTypes: Map = CsvTsvParams.COL_TYPES, skipLines: Long = CsvTsvParams.SKIP_LINES, readLines: Long? = CsvTsvParams.READ_LINES, @@ -148,7 +151,7 @@ public fun DataFrame.Companion.readTsv( inputStream = inputStream, delimiter = delimiter, header = header, - isCompressed = isCompressed, + compression = compression, colTypes = colTypes, skipLines = skipLines, readLines = readLines, @@ -168,7 +171,7 @@ public fun DataFrame.Companion.readTsvStr( text: String, delimiter: Char = CsvTsvParams.TSV_DELIMITER, header: List = CsvTsvParams.HEADER, - isCompressed: Boolean = CsvTsvParams.IS_COMPRESSED, + compression: CsvCompression<*> = CsvTsvParams.COMPRESSION, colTypes: Map = CsvTsvParams.COL_TYPES, skipLines: Long = CsvTsvParams.SKIP_LINES, readLines: Long? = CsvTsvParams.READ_LINES, @@ -185,7 +188,7 @@ public fun DataFrame.Companion.readTsvStr( inputStream = text.byteInputStream(), delimiter = delimiter, header = header, - isCompressed = isCompressed, + compression = compression, colTypes = colTypes, skipLines = skipLines, readLines = readLines, diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/writeCsv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/writeCsv.kt index eb62c7808c..c4a8e42e9c 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/writeCsv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/writeCsv.kt @@ -17,7 +17,7 @@ public fun AnyFrame.writeCsv( escapeChar: Char? = CsvTsvParams.ESCAPE_CHAR, commentChar: Char? = CsvTsvParams.COMMENT_CHAR, headerComments: List = CsvTsvParams.HEADER_COMMENTS, - recordSeparator: Char = CsvTsvParams.RECORD_SEPARATOR, + recordSeparator: String = CsvTsvParams.RECORD_SEPARATOR, ): Unit = writeCsvOrTsvImpl( df = this, @@ -42,7 +42,7 @@ public fun AnyFrame.writeCsv( escapeChar: Char? = CsvTsvParams.ESCAPE_CHAR, commentChar: Char? = CsvTsvParams.COMMENT_CHAR, headerComments: List = CsvTsvParams.HEADER_COMMENTS, - recordSeparator: Char = CsvTsvParams.RECORD_SEPARATOR, + recordSeparator: String = CsvTsvParams.RECORD_SEPARATOR, ): Unit = writeCsvOrTsvImpl( df = this, @@ -68,7 +68,7 @@ public fun AnyFrame.writeCsv( escapeChar: Char? = CsvTsvParams.ESCAPE_CHAR, commentChar: Char? = CsvTsvParams.COMMENT_CHAR, headerComments: List = CsvTsvParams.HEADER_COMMENTS, - recordSeparator: Char = CsvTsvParams.RECORD_SEPARATOR, + recordSeparator: String = CsvTsvParams.RECORD_SEPARATOR, additionalCsvFormat: CSVFormat = CsvTsvParams.ADDITIONAL_CSV_FORMAT, ): Unit = writeCsvOrTsvImpl( @@ -94,7 +94,7 @@ public fun AnyFrame.toCsvStr( escapeChar: Char? = CsvTsvParams.ESCAPE_CHAR, commentChar: Char? = CsvTsvParams.COMMENT_CHAR, headerComments: List = CsvTsvParams.HEADER_COMMENTS, - recordSeparator: Char = CsvTsvParams.RECORD_SEPARATOR, + recordSeparator: String = CsvTsvParams.RECORD_SEPARATOR, ): String = buildString { writeCsvOrTsvImpl( diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/writeTsv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/writeTsv.kt index d1a501841d..33dfa444ef 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/writeTsv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/writeTsv.kt @@ -17,7 +17,7 @@ public fun AnyFrame.writeTsv( escapeChar: Char? = CsvTsvParams.ESCAPE_CHAR, commentChar: Char? = CsvTsvParams.COMMENT_CHAR, headerComments: List = CsvTsvParams.HEADER_COMMENTS, - recordSeparator: Char = CsvTsvParams.RECORD_SEPARATOR, + recordSeparator: String = CsvTsvParams.RECORD_SEPARATOR, ): Unit = writeCsvOrTsvImpl( df = this, @@ -42,7 +42,7 @@ public fun AnyFrame.writeTsv( escapeChar: Char? = CsvTsvParams.ESCAPE_CHAR, commentChar: Char? = CsvTsvParams.COMMENT_CHAR, headerComments: List = CsvTsvParams.HEADER_COMMENTS, - recordSeparator: Char = CsvTsvParams.RECORD_SEPARATOR, + recordSeparator: String = CsvTsvParams.RECORD_SEPARATOR, ): Unit = writeCsvOrTsvImpl( df = this, @@ -68,7 +68,7 @@ public fun AnyFrame.writeTsv( escapeChar: Char? = CsvTsvParams.ESCAPE_CHAR, commentChar: Char? = CsvTsvParams.COMMENT_CHAR, headerComments: List = CsvTsvParams.HEADER_COMMENTS, - recordSeparator: Char = CsvTsvParams.RECORD_SEPARATOR, + recordSeparator: String = CsvTsvParams.RECORD_SEPARATOR, additionalCsvFormat: CSVFormat = CsvTsvParams.ADDITIONAL_CSV_FORMAT, ): Unit = writeCsvOrTsvImpl( @@ -94,7 +94,7 @@ public fun AnyFrame.toTsvStr( escapeChar: Char? = CsvTsvParams.ESCAPE_CHAR, commentChar: Char? = CsvTsvParams.COMMENT_CHAR, headerComments: List = CsvTsvParams.HEADER_COMMENTS, - recordSeparator: Char = CsvTsvParams.RECORD_SEPARATOR, + recordSeparator: String = CsvTsvParams.RECORD_SEPARATOR, ): String = buildString { writeCsvOrTsvImpl( diff --git a/dataframe-csv/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTsvTests.kt b/dataframe-csv/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTsvTests.kt index a53b0b1ac3..45f4209ce6 100644 --- a/dataframe-csv/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTsvTests.kt +++ b/dataframe-csv/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTsvTests.kt @@ -1,6 +1,7 @@ package org.jetbrains.kotlinx.dataframe.io import io.kotest.assertions.throwables.shouldNotThrowAny +import io.kotest.assertions.throwables.shouldThrow import io.kotest.matchers.nulls.shouldNotBeNull import io.kotest.matchers.shouldBe import kotlinx.datetime.LocalDateTime @@ -22,6 +23,7 @@ import java.io.File import java.io.StringWriter import java.net.URL import java.util.Locale +import java.util.zip.GZIPInputStream import kotlin.reflect.KClass import kotlin.reflect.typeOf @@ -77,6 +79,62 @@ class CsvTsvTests { df.print(columnTypes = true, borders = true, title = true) } + @Test + fun `read ZIP Csv`() { + val df = DataFrame.readCsv(simpleCsvZip) + + df.columnsCount() shouldBe 11 + df.rowsCount() shouldBe 5 + df.columnNames()[5] shouldBe "duplicate1" + df.columnNames()[6] shouldBe "duplicate11" + df["duplicate1"].type() shouldBe typeOf() + df["double"].type() shouldBe typeOf() + df["number"].type() shouldBe typeOf() + df["time"].type() shouldBe typeOf() + + df.print(columnTypes = true, borders = true, title = true) + } + + @Test + fun `read GZ Csv`() { + val df = DataFrame.readCsv(simpleCsvGz) + + df.columnsCount() shouldBe 11 + df.rowsCount() shouldBe 5 + df.columnNames()[5] shouldBe "duplicate1" + df.columnNames()[6] shouldBe "duplicate11" + df["duplicate1"].type() shouldBe typeOf() + df["double"].type() shouldBe typeOf() + df["number"].type() shouldBe typeOf() + df["time"].type() shouldBe typeOf() + + df.print(columnTypes = true, borders = true, title = true) + } + + @Test + fun `read custom compression Csv`() { + val df = DataFrame.readCsv( + simpleCsvGz, + compression = CsvCompression.Custom { GZIPInputStream(it) }, + ) + + df.columnsCount() shouldBe 11 + df.rowsCount() shouldBe 5 + df.columnNames()[5] shouldBe "duplicate1" + df.columnNames()[6] shouldBe "duplicate11" + df["duplicate1"].type() shouldBe typeOf() + df["double"].type() shouldBe typeOf() + df["number"].type() shouldBe typeOf() + df["time"].type() shouldBe typeOf() + + df.print(columnTypes = true, borders = true, title = true) + } + + @Test + fun `read 2 compressed Csv`() { + shouldThrow { DataFrame.readCsv(twoCsvsZip) } + } + @Test fun readCsvWithFrenchLocaleAndAlternativeDelimiter() { val df = DataFrame.readCsv( @@ -247,10 +305,9 @@ class CsvTsvTests { 1, 3, 2, ) df.writeCsv( - "src/test/resources/without_header.csv", -// CSVFormat.DEFAULT.builder() -// .setSkipHeaderRecord(true) -// .build(), + path = "src/test/resources/without_header.csv", + includeHeader = false, + recordSeparator = "\r\n", ) val producedFile = File("src/test/resources/without_header.csv") producedFile.exists() shouldBe true @@ -260,7 +317,10 @@ class CsvTsvTests { @Test fun `check integrity of example data`() { - val df = DataFrame.readCsv("../data/jetbrains_repositories.csv") + val df = DataFrame.readCsv( + "../data/jetbrains_repositories.csv", + skipLines = 1, // now needs this, ignoreEmptyLines cannot catch it + ) df.columnNames() shouldBe listOf("full_name", "html_url", "stargazers_count", "topics", "watchers") df.columnTypes() shouldBe listOf( typeOf(), @@ -269,7 +329,10 @@ class CsvTsvTests { typeOf(), typeOf(), ) - df shouldBe DataFrame.readCsv("../data/jetbrains repositories.csv") + df shouldBe DataFrame.readCsv( + "../data/jetbrains repositories.csv", + skipLines = 1, // now needs this, ignoreEmptyLines cannot catch it + ) } @Test @@ -334,8 +397,17 @@ class CsvTsvTests { df shouldBe dataFrameOf("a", "b", "c")(1, 2, 3) } + @Test + fun `don't read folder`() { + shouldThrow { DataFrame.readCsv("") } + shouldThrow { DataFrame.readCsv("NON EXISTENT FILE") } + } + companion object { private val simpleCsv = testCsv("testCSV") + private val simpleCsvZip = testResource("testCSV.zip") + private val twoCsvsZip = testResource("two csvs.zip") + private val simpleCsvGz = testResource("testCSV.csv.gz") private val csvWithFrenchLocale = testCsv("testCSVwithFrenchLocale") private val wineCsv = testCsv("wine") private val durationCsv = testCsv("duration") diff --git a/dataframe-csv/src/test/resources/testCSV.csv.gz b/dataframe-csv/src/test/resources/testCSV.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..91a6514edb0943b725cbdbc3f928367b5a3584f8 GIT binary patch literal 265 zcmV+k0rvhMiwFqXPc&ly19W9`bVE~CE@N|c0A0^bZi6ro1>k*7!2urW%-A&GU00E^ zh${5}l>xU!F+ZVL6}PVgltm?LWR2$2n?d+?<9k!1v#moN2H#Y6a(J}=we}yQ?nCJv zy3m$x#MHz&*ACNsAZVOjKYlvEmKAKA@g6LskQK#ISV2fQrT9(>XNJWnS*8w*46inL zV0hF;F6H|40%09B@0xYO+Nfv4#i|84F8x>6&A|P~jnj;BOaFaeoatA2+k#+yUO~94vfp z2CspE(ZI;S5a7+oB*KgcMPxZ{1_^kW{%-^^@r5ZX10yI*1H4(;KuQ^bFc3()04-r) F003ckl6e3C literal 0 HcmV?d00001 diff --git a/dataframe-csv/src/test/resources/two csvs.zip b/dataframe-csv/src/test/resources/two csvs.zip new file mode 100644 index 0000000000000000000000000000000000000000..ac765bcf481a940e46897c1d34d78f97b5ea283c GIT binary patch literal 1451 zcmWIWW@Zs#-~htl_9>wZNPwR~fg!Ufv)CoEB(XTPL@&9xth9ogfsqBI226y8@G`Kk zzr8SdLi^veJ|M0QyPfygfur_)`j4BNOvKZU-QO5=e{tTGUXiV#Qg2t)?@vC^$*W$u zI?>|5LkFuG`Tcy)U)KDo*#A%Rn4{hQxpkkfU;ntP^wjfLf2w!P>AODvVe2x3Wu3UifdWRo#n>zy8gWp4Q%4 zoYFpbx2_S_ikh{mH`a(gFgdD!Pwz!ywr>tE+wNX&CzIN1hU=%MU#U0xc2?(-ocdRu zGvS-w&wilZJ^yL#miMbyyH~s~W?Daafxx5eZJWzvF3r6eJ3sKZNmgQY-6AUk#d#%+ zznu0Qx8q;J^Xl7=7|&&^IhOg}ntHeQ=RVfN_4l8|alejxEB$86(FOaMId2%>xh%T$ zbLfP=w8h=G*&MFk%oI7Bx!v~QUggm1|F-&Aef}MH>h;G@wN<}u>f_p`FY-;D+E;jL zjnVt5g_AB#nJFN=eaEkFpITyKLe&?ZI3nw@HuZhXsg9WZ{ENp#_eD=KyxFtSdi6YZ zXNUD6&(f2p1?w_xTT`-O$Cg}gjwq%DGq>z(d8Hi_ePJ`3lCa$QA_e97haYrz2C4Q% zcPTz)Ha$17TT1%X#)mW2*;=D^9yK(-=pE4E%@KBs{ftQ7{>-K4Zd%HA@~@qJ?@z~c z*O{!R8$+si1GibN?$4HfdGpv!*Qy-fm^zcvpO)vVisU#}E|OHad7aZIKQz)oZ_+`# zBGJ~8WL}=g3Cm_5`Khr}GvTo7meBA^qCxq$vluH^823#!tyHj0x;W`y`lknNjEAmC zt9hI4nWy)DdH?0x57)6t{@n99bXoV0Bdv4Y+06TogqS^6?#tTqwt7$Mq9XC{6-92I zsv*8po@n`WoDH??%bT7kE&g+x=+iS-Z~Q&8{A0>fWlKA*|9)MEni2zF8)Pi`?(l3{ ztlCP)X$*xP3$%aA2U>0ab84f;nPrYouD^blXM8rwwbXFY`(xjQQ!i;X7_GV&W53q5 zb&_^N=25|ynt6GvzMgbi91}V7d)DKaRWp-pB7ZyGHqibonU!|7^+(38_-Vfnd|q_h z(Wp4=ZB(v`-S$bmnOY7{Pnf#OoRgY-a9;S=g)!N4)_=Si`cOCNDC5+3B1tzN&G!#$ zPIuXM{`n%)W0_tSXBK>$ncS58%09rGonzr9OD#WU28IrP28IB5R-IdGgOXLbfmyXA zwYbDNI1E=dz2$EL&!(~Gee;?P1Xv%``WOaFaeoatA2+k# z+@LUI;d3*14J@-77(rpk$RxsyDDaTOm!ClbUi|%U1ThH}f2<6QpyChNP*Bt$TgZ)K f=xSh8;)^JtfuM*A@MdKL+0O)o%YgKIaB&L&wK{Y* literal 0 HcmV?d00001