From ee485548d7150d35da41b93fb420595dd1d176ce Mon Sep 17 00:00:00 2001 From: Alec Huang Date: Fri, 18 Oct 2024 14:25:06 -0700 Subject: [PATCH] SNOW-1727532 Set number of values for repeated fields (#861) --- .../streaming/internal/AbstractRowBuffer.java | 23 +- .../streaming/internal/BlobBuilder.java | 20 +- .../streaming/internal/ChannelData.java | 2 +- .../streaming/internal/ChunkMetadata.java | 16 +- .../internal/ClientBufferParameters.java | 34 ++- .../ingest/streaming/internal/EpInfo.java | 12 +- .../internal/FileColumnProperties.java | 19 ++ .../streaming/internal/FlushService.java | 6 +- .../ingest/streaming/internal/Flusher.java | 5 +- .../internal/IcebergParquetValueParser.java | 29 ++- .../internal/InternalParameterProvider.java | 13 ++ .../streaming/internal/ParquetFlusher.java | 11 +- .../streaming/internal/ParquetRowBuffer.java | 87 +++++-- .../streaming/internal/RowBufferStats.java | 105 +++++++-- .../net/snowflake/ingest/utils/Constants.java | 2 + .../ingest/utils/IcebergDataTypeParser.java | 6 +- .../ingest/utils/ParameterProvider.java | 42 ++-- .../ingest/utils/SubColumnFinder.java | 69 ++++++ .../net/snowflake/ingest/utils/Utils.java | 32 ++- .../parquet/hadoop/BdecParquetReader.java | 10 +- ...riter.java => SnowflakeParquetWriter.java} | 37 ++- .../streaming/internal/BlobBuilderTest.java | 122 +++++++++- .../streaming/internal/ChannelDataTest.java | 35 ++- .../internal/FileColumnPropertiesTest.java | 12 +- .../streaming/internal/FlushServiceTest.java | 22 +- .../IcebergParquetValueParserTest.java | 147 ++++++++---- .../internal/RowBufferStatsTest.java | 109 ++++++--- .../streaming/internal/RowBufferTest.java | 218 ++++++++++++++++-- .../SnowflakeParquetValueParserTest.java | 44 ++-- .../SnowflakeStreamingIngestClientTest.java | 16 +- .../datatypes/AbstractDataTypeTest.java | 37 +-- .../datatypes/IcebergNumericTypesIT.java | 4 + .../datatypes/IcebergStructuredIT.java | 18 +- .../ingest/utils/SubColumnFinderTest.java | 140 +++++++++++ 34 files 
changed, 1237 insertions(+), 267 deletions(-) create mode 100644 src/main/java/net/snowflake/ingest/utils/SubColumnFinder.java rename src/main/java/org/apache/parquet/hadoop/{BdecParquetWriter.java => SnowflakeParquetWriter.java} (91%) create mode 100644 src/test/java/net/snowflake/ingest/utils/SubColumnFinderTest.java diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java b/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java index fa502f30a..69558d5d9 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Snowflake Computing Inc. All rights reserved. + * Copyright (c) 2022-2024 Snowflake Computing Inc. All rights reserved. */ package net.snowflake.ingest.streaming.internal; @@ -292,6 +292,9 @@ public InsertValidationResponse insertRows( // Temp stats map to use until all the rows are validated @VisibleForTesting Map tempStatsMap; + // Map of the column name to the column object, used for null/missing column check + protected final Map fieldIndex; + // Lock used to protect the buffers from concurrent read/write private final Lock flushLock; @@ -352,6 +355,8 @@ public InsertValidationResponse insertRows( // Initialize empty stats this.statsMap = new HashMap<>(); this.tempStatsMap = new HashMap<>(); + + this.fieldIndex = new HashMap<>(); } /** @@ -427,7 +432,7 @@ Set verifyInputColumns( List missingCols = new ArrayList<>(); for (String columnName : this.nonNullableFieldNames) { if (!inputColNamesMap.containsKey(columnName)) { - missingCols.add(statsMap.get(columnName).getColumnDisplayName()); + missingCols.add(fieldIndex.get(columnName).columnMetadata.getName()); } } @@ -447,7 +452,7 @@ Set verifyInputColumns( for (String columnName : this.nonNullableFieldNames) { if (inputColNamesMap.containsKey(columnName) && 
row.get(inputColNamesMap.get(columnName)) == null) { - nullValueNotNullCols.add(statsMap.get(columnName).getColumnDisplayName()); + nullValueNotNullCols.add(fieldIndex.get(columnName).columnMetadata.getName()); } } @@ -642,13 +647,17 @@ public synchronized void close(String name) { * * @param rowCount: count of rows in the given buffer * @param colStats: map of column name to RowBufferStats - * @param setAllDefaultValues: whether to set default values for all null fields the EPs - * irrespective of the data type of this column + * @param setAllDefaultValues: whether to set default values for all null min/max field in the EPs + * @param enableDistinctValuesCount: whether to include valid NDV in the EPs irrespective of the + * data type of this column * @return the EPs built from column stats */ static EpInfo buildEpInfoFromStats( - long rowCount, Map colStats, boolean setAllDefaultValues) { - EpInfo epInfo = new EpInfo(rowCount, new HashMap<>()); + long rowCount, + Map colStats, + boolean setAllDefaultValues, + boolean enableDistinctValuesCount) { + EpInfo epInfo = new EpInfo(rowCount, new HashMap<>(), enableDistinctValuesCount); for (Map.Entry colStat : colStats.entrySet()) { RowBufferStats stat = colStat.getValue(); FileColumnProperties dto = new FileColumnProperties(stat, setAllDefaultValues); diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/BlobBuilder.java b/src/main/java/net/snowflake/ingest/streaming/internal/BlobBuilder.java index 3e1de452a..060af357f 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/BlobBuilder.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/BlobBuilder.java @@ -11,6 +11,7 @@ import static net.snowflake.ingest.utils.Constants.BLOB_NO_HEADER; import static net.snowflake.ingest.utils.Constants.BLOB_TAG_SIZE_IN_BYTES; import static net.snowflake.ingest.utils.Constants.BLOB_VERSION_SIZE_IN_BYTES; +import static net.snowflake.ingest.utils.Utils.getParquetFooterSize; import static 
net.snowflake.ingest.utils.Utils.toByteArray; import com.fasterxml.jackson.core.JsonProcessingException; @@ -29,10 +30,11 @@ import javax.crypto.NoSuchPaddingException; import net.snowflake.ingest.utils.Constants; import net.snowflake.ingest.utils.Cryptor; +import net.snowflake.ingest.utils.ErrorCode; import net.snowflake.ingest.utils.Logging; import net.snowflake.ingest.utils.Pair; +import net.snowflake.ingest.utils.SFException; import org.apache.commons.codec.binary.Hex; -import org.apache.parquet.hadoop.ParquetFileWriter; /** * Build a single blob file that contains file header plus data. The header will be a @@ -135,17 +137,27 @@ static Blob constructBlobAndMetadata( AbstractRowBuffer.buildEpInfoFromStats( serializedChunk.rowCount, serializedChunk.columnEpStatsMapCombined, - internalParameterProvider.setAllDefaultValuesInEp())) + internalParameterProvider.setAllDefaultValuesInEp(), + internalParameterProvider.isEnableDistinctValuesCount())) .setFirstInsertTimeInMs(serializedChunk.chunkMinMaxInsertTimeInMs.getFirst()) .setLastInsertTimeInMs(serializedChunk.chunkMinMaxInsertTimeInMs.getSecond()); if (internalParameterProvider.setIcebergSpecificFieldsInEp()) { + if (internalParameterProvider.getEnableChunkEncryption()) { + /* metadata size computation only works when encryption and padding is off */ + throw new SFException( + ErrorCode.INTERNAL_ERROR, + "Metadata size computation is only supported when encryption is disabled"); + } + final long metadataSize = getParquetFooterSize(compressedChunkData); + final long extendedMetadataSize = serializedChunk.extendedMetadataSize; chunkMetadataBuilder - .setMajorVersion(ParquetFileWriter.CURRENT_VERSION) + .setMajorVersion(Constants.PARQUET_MAJOR_VERSION) .setMinorVersion(Constants.PARQUET_MINOR_VERSION) // set createdOn in seconds .setCreatedOn(System.currentTimeMillis() / 1000) - .setExtendedMetadataSize(-1L); + .setMetadataSize(metadataSize) + .setExtendedMetadataSize(extendedMetadataSize); } ChunkMetadata 
chunkMetadata = chunkMetadataBuilder.build(); diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/ChannelData.java b/src/main/java/net/snowflake/ingest/streaming/internal/ChannelData.java index 81f49d2fd..3ad8855cc 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/ChannelData.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/ChannelData.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Snowflake Computing Inc. All rights reserved. + * Copyright (c) 2021-2024 Snowflake Computing Inc. All rights reserved. */ package net.snowflake.ingest.streaming.internal; diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/ChunkMetadata.java b/src/main/java/net/snowflake/ingest/streaming/internal/ChunkMetadata.java index 006782d25..e180960ce 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/ChunkMetadata.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/ChunkMetadata.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Snowflake Computing Inc. All rights reserved. + * Copyright (c) 2021-2024 Snowflake Computing Inc. All rights reserved. 
*/ package net.snowflake.ingest.streaming.internal; @@ -26,6 +26,7 @@ class ChunkMetadata { private Integer majorVersion; private Integer minorVersion; private Long createdOn; + private Long metadataSize; private Long extendedMetadataSize; static Builder builder() { @@ -51,6 +52,7 @@ static class Builder { private Integer majorVersion; private Integer minorVersion; private Long createdOn; + private Long metadataSize; private Long extendedMetadataSize; Builder setOwningTableFromChannelContext(ChannelFlushContext channelFlushContext) { @@ -124,6 +126,11 @@ Builder setCreatedOn(Long createdOn) { return this; } + Builder setMetadataSize(Long metadataSize) { + this.metadataSize = metadataSize; + return this; + } + Builder setExtendedMetadataSize(Long extendedMetadataSize) { this.extendedMetadataSize = extendedMetadataSize; return this; @@ -165,6 +172,7 @@ private ChunkMetadata(Builder builder) { this.majorVersion = builder.majorVersion; this.minorVersion = builder.minorVersion; this.createdOn = builder.createdOn; + this.metadataSize = builder.metadataSize; this.extendedMetadataSize = builder.extendedMetadataSize; } @@ -258,6 +266,12 @@ Long getCreatedOn() { return this.createdOn; } + @JsonProperty("metadata_size") + @JsonInclude(JsonInclude.Include.NON_NULL) + Long getMetadataSize() { + return this.metadataSize; + } + @JsonProperty("ext_metadata_size") @JsonInclude(JsonInclude.Include.NON_NULL) Long getExtendedMetadataSize() { diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/ClientBufferParameters.java b/src/main/java/net/snowflake/ingest/streaming/internal/ClientBufferParameters.java index 9009642b3..36a6db282 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/ClientBufferParameters.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/ClientBufferParameters.java @@ -25,6 +25,10 @@ public class ClientBufferParameters { private boolean isIcebergMode; + private boolean enableDistinctValuesCount; + + private boolean 
enableValuesCount; + /** * Private constructor used for test methods * @@ -38,13 +42,17 @@ private ClientBufferParameters( Constants.BdecParquetCompression bdecParquetCompression, boolean enableNewJsonParsingLogic, Optional maxRowGroups, - boolean isIcebergMode) { + boolean isIcebergMode, + boolean enableDistinctValuesCount, + boolean enableValuesCount) { this.maxChunkSizeInBytes = maxChunkSizeInBytes; this.maxAllowedRowSizeInBytes = maxAllowedRowSizeInBytes; this.bdecParquetCompression = bdecParquetCompression; this.enableNewJsonParsingLogic = enableNewJsonParsingLogic; this.maxRowGroups = maxRowGroups; this.isIcebergMode = isIcebergMode; + this.enableDistinctValuesCount = enableDistinctValuesCount; + this.enableValuesCount = enableValuesCount; } /** @param clientInternal reference to the client object where the relevant parameters are set */ @@ -73,6 +81,14 @@ public ClientBufferParameters(SnowflakeStreamingIngestClientInternal clientInter isIcebergMode ? Optional.of(InternalParameterProvider.MAX_ROW_GROUP_COUNT_ICEBERG_MODE_DEFAULT) : Optional.empty(); + this.enableDistinctValuesCount = + clientInternal != null + ? clientInternal.getInternalParameterProvider().isEnableDistinctValuesCount() + : InternalParameterProvider.ENABLE_DISTINCT_VALUES_COUNT_DEFAULT; + this.enableValuesCount = + clientInternal != null + ? 
clientInternal.getInternalParameterProvider().isEnableValuesCount() + : InternalParameterProvider.ENABLE_VALUES_COUNT_DEFAULT; } /** @@ -87,14 +103,18 @@ public static ClientBufferParameters test_createClientBufferParameters( Constants.BdecParquetCompression bdecParquetCompression, boolean enableNewJsonParsingLogic, Optional maxRowGroups, - boolean isIcebergMode) { + boolean isIcebergMode, + boolean enableDistinctValuesCount, + boolean enableValuesCount) { return new ClientBufferParameters( maxChunkSizeInBytes, maxAllowedRowSizeInBytes, bdecParquetCompression, enableNewJsonParsingLogic, maxRowGroups, - isIcebergMode); + isIcebergMode, + enableDistinctValuesCount, + enableValuesCount); } public long getMaxChunkSizeInBytes() { @@ -125,6 +145,14 @@ public String getParquetMessageTypeName() { return isIcebergMode ? PARQUET_MESSAGE_TYPE_NAME : BDEC_PARQUET_MESSAGE_TYPE_NAME; } + public boolean isEnableDistinctValuesCount() { + return enableDistinctValuesCount; + } + + public boolean isEnableValuesCount() { + return enableValuesCount; + } + public boolean isEnableDictionaryEncoding() { return isIcebergMode; } diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/EpInfo.java b/src/main/java/net/snowflake/ingest/streaming/internal/EpInfo.java index e6e6a4d9d..28203cfd5 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/EpInfo.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/EpInfo.java @@ -13,12 +13,18 @@ class EpInfo { private Map columnEps; + private boolean enableDistinctValuesCount; + /** Default constructor, needed for Jackson */ EpInfo() {} - EpInfo(long rowCount, Map columnEps) { + EpInfo( + long rowCount, + Map columnEps, + boolean enableDistinctValuesCount) { this.rowCount = rowCount; this.columnEps = columnEps; + this.enableDistinctValuesCount = enableDistinctValuesCount; } /** Some basic verification logic to make sure the EP info is correct */ @@ -35,8 +41,8 @@ public void verifyEpInfo() { colName, 
colEp.getNullCount(), rowCount)); } - // Make sure the NDV should always be -1 - if (colEp.getDistinctValues() != EP_NDV_UNKNOWN) { + // Make sure the NDV is always -1 (unknown) when distinct values count is disabled + if (!enableDistinctValuesCount && colEp.getDistinctValues() != EP_NDV_UNKNOWN) { throw new SFException( ErrorCode.INTERNAL_ERROR, String.format( diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/FileColumnProperties.java b/src/main/java/net/snowflake/ingest/streaming/internal/FileColumnProperties.java index b3c7aedf5..d788bcfd9 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/FileColumnProperties.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FileColumnProperties.java @@ -5,6 +5,7 @@ package net.snowflake.ingest.streaming.internal; import static net.snowflake.ingest.streaming.internal.BinaryStringUtils.truncateBytesAsHex; +import static net.snowflake.ingest.utils.Constants.EP_NV_UNKNOWN; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; @@ -44,6 +45,9 @@ class FileColumnProperties { private long nullCount; + // for elements in repeated columns + private Long numberOfValues; + // for binary or string columns private long maxLength; @@ -110,6 +114,10 @@ class FileColumnProperties { this.setMinStrNonCollated(null); this.setNullCount(stats.getCurrentNullCount()); this.setDistinctValues(stats.getDistinctValues()); + + if (stats.getNumberOfValues() != EP_NV_UNKNOWN) { + this.setNumberOfValues(stats.getNumberOfValues()); + } } private void setIntValues(RowBufferStats stats) { @@ -284,6 +292,16 @@ void setMaxStrNonCollated(String maxStrNonCollated) { this.maxStrNonCollated = maxStrNonCollated; } + @JsonProperty("numberOfValues") + @JsonInclude(JsonInclude.Include.NON_NULL) + Long getNumberOfValues() { + return numberOfValues; + } + + void setNumberOfValues(Long numberOfValues) { + this.numberOfValues = numberOfValues; + } + @Override public String toString() { 
final StringBuilder sb = new StringBuilder("{"); @@ -306,6 +324,7 @@ public String toString() { } sb.append(", \"distinctValues\": ").append(distinctValues); sb.append(", \"nullCount\": ").append(nullCount); + sb.append(", \"numberOfValues\": ").append(numberOfValues); return sb.append('}').toString(); } diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java index 2e64f77b8..d11762340 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java @@ -597,11 +597,13 @@ BlobMetadata buildAndUpload( InvalidKeyException { Timer.Context buildContext = Utils.createTimerContext(this.owningClient.buildLatency); - InternalParameterProvider paramProvider = this.owningClient.getInternalParameterProvider(); // Construct the blob along with the metadata of the blob BlobBuilder.Blob blob = BlobBuilder.constructBlobAndMetadata( - blobPath.fileName, blobData, bdecVersion, paramProvider); + blobPath.fileName, + blobData, + bdecVersion, + this.owningClient.getInternalParameterProvider()); blob.blobStats.setBuildDurationMs(buildContext); diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/Flusher.java b/src/main/java/net/snowflake/ingest/streaming/internal/Flusher.java index 241defdfc..5a426e873 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/Flusher.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/Flusher.java @@ -36,6 +36,7 @@ class SerializationResult { final float chunkEstimatedUncompressedSize; final ByteArrayOutputStream chunkData; final Pair chunkMinMaxInsertTimeInMs; + final long extendedMetadataSize; public SerializationResult( List channelsMetadataList, @@ -43,13 +44,15 @@ public SerializationResult( long rowCount, float chunkEstimatedUncompressedSize, ByteArrayOutputStream chunkData, - Pair chunkMinMaxInsertTimeInMs) { + 
Pair chunkMinMaxInsertTimeInMs, + long extendedMetadataSize) { this.channelsMetadataList = channelsMetadataList; this.columnEpStatsMapCombined = columnEpStatsMapCombined; this.rowCount = rowCount; this.chunkEstimatedUncompressedSize = chunkEstimatedUncompressedSize; this.chunkData = chunkData; this.chunkMinMaxInsertTimeInMs = chunkMinMaxInsertTimeInMs; + this.extendedMetadataSize = extendedMetadataSize; } } } diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/IcebergParquetValueParser.java b/src/main/java/net/snowflake/ingest/streaming/internal/IcebergParquetValueParser.java index 963dbf188..7c2f90710 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/IcebergParquetValueParser.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/IcebergParquetValueParser.java @@ -25,6 +25,7 @@ import net.snowflake.ingest.utils.Constants; import net.snowflake.ingest.utils.ErrorCode; import net.snowflake.ingest.utils.SFException; +import net.snowflake.ingest.utils.SubColumnFinder; import net.snowflake.ingest.utils.Utils; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.LogicalTypeAnnotation; @@ -49,6 +50,7 @@ class IcebergParquetValueParser { * @param value column value provided by user in a row * @param type Parquet column type * @param statsMap column stats map to update + * @param subColumnFinder helper class to find stats of sub-columns * @param defaultTimezone default timezone to use for timestamp parsing * @param insertRowsCurrIndex Row index corresponding the row to parse (w.r.t input rows in * insertRows API, and not buffered row) @@ -58,17 +60,19 @@ static ParquetBufferValue parseColumnValueToParquet( Object value, Type type, Map statsMap, + SubColumnFinder subColumnFinder, ZoneId defaultTimezone, long insertRowsCurrIndex) { Utils.assertNotNull("Parquet column stats map", statsMap); return parseColumnValueToParquet( - value, type, statsMap, defaultTimezone, insertRowsCurrIndex, null, false); + value, 
type, statsMap, subColumnFinder, defaultTimezone, insertRowsCurrIndex, null, false); } private static ParquetBufferValue parseColumnValueToParquet( Object value, Type type, Map statsMap, + SubColumnFinder subColumnFinder, ZoneId defaultTimezone, long insertRowsCurrIndex, String path, @@ -152,6 +156,7 @@ private static ParquetBufferValue parseColumnValueToParquet( value, type.asGroupType(), statsMap, + subColumnFinder, defaultTimezone, insertRowsCurrIndex, path, @@ -164,9 +169,9 @@ private static ParquetBufferValue parseColumnValueToParquet( throw new SFException( ErrorCode.INVALID_FORMAT_ROW, path, "Passed null to non nullable field"); } - if (type.isPrimitive()) { - statsMap.get(path).incCurrentNullCount(); - } + subColumnFinder + .getSubColumns(path) + .forEach(subColumn -> statsMap.get(subColumn).incCurrentNullCount()); } return new ParquetBufferValue(value, estimatedParquetSize); @@ -366,6 +371,7 @@ private static int timeUnitToScale(LogicalTypeAnnotation.TimeUnit timeUnit) { * @param value value to parse * @param type Parquet column type * @param statsMap column stats map to update + * @param subColumnFinder helper class to find stats of sub-columns * @param defaultTimezone default timezone to use for timestamp parsing * @param insertRowsCurrIndex Used for logging the row of index given in insertRows API * @param path dot path of the column @@ -376,6 +382,7 @@ private static ParquetBufferValue getGroupValue( Object value, GroupType type, Map statsMap, + SubColumnFinder subColumnFinder, ZoneId defaultTimezone, final long insertRowsCurrIndex, String path, @@ -386,16 +393,19 @@ private static ParquetBufferValue getGroupValue( value, type, statsMap, + subColumnFinder, defaultTimezone, insertRowsCurrIndex, path, isDescendantsOfRepeatingGroup); } if (logicalTypeAnnotation instanceof LogicalTypeAnnotation.ListLogicalTypeAnnotation) { - return get3LevelListValue(value, type, statsMap, defaultTimezone, insertRowsCurrIndex, path); + return get3LevelListValue( + value, 
type, statsMap, subColumnFinder, defaultTimezone, insertRowsCurrIndex, path); } if (logicalTypeAnnotation instanceof LogicalTypeAnnotation.MapLogicalTypeAnnotation) { - return get3LevelMapValue(value, type, statsMap, defaultTimezone, insertRowsCurrIndex, path); + return get3LevelMapValue( + value, type, statsMap, subColumnFinder, defaultTimezone, insertRowsCurrIndex, path); } throw new SFException( ErrorCode.UNKNOWN_DATA_TYPE, logicalTypeAnnotation, type.getClass().getSimpleName()); @@ -410,6 +420,7 @@ private static ParquetBufferValue getStructValue( Object value, GroupType type, Map statsMap, + SubColumnFinder subColumnFinder, ZoneId defaultTimezone, final long insertRowsCurrIndex, String path, @@ -425,6 +436,7 @@ private static ParquetBufferValue getStructValue( structVal.getOrDefault(type.getFieldName(i), null), type.getType(i), statsMap, + subColumnFinder, defaultTimezone, insertRowsCurrIndex, path, @@ -457,6 +469,7 @@ private static ParquetBufferValue get3LevelListValue( Object value, GroupType type, Map statsMap, + SubColumnFinder subColumnFinder, ZoneId defaultTimezone, final long insertRowsCurrIndex, String path) { @@ -471,6 +484,7 @@ private static ParquetBufferValue get3LevelListValue( val, type.getType(0).asGroupType().getType(0), statsMap, + subColumnFinder, defaultTimezone, insertRowsCurrIndex, listGroupPath, @@ -492,6 +506,7 @@ private static ParquetBufferValue get3LevelMapValue( Object value, GroupType type, Map statsMap, + SubColumnFinder subColumnFinder, ZoneId defaultTimezone, final long insertRowsCurrIndex, String path) { @@ -506,6 +521,7 @@ private static ParquetBufferValue get3LevelMapValue( entry.getKey(), type.getType(0).asGroupType().getType(0), statsMap, + subColumnFinder, defaultTimezone, insertRowsCurrIndex, mapGroupPath, @@ -515,6 +531,7 @@ private static ParquetBufferValue get3LevelMapValue( entry.getValue(), type.getType(0).asGroupType().getType(1), statsMap, + subColumnFinder, defaultTimezone, insertRowsCurrIndex, mapGroupPath, diff 
--git a/src/main/java/net/snowflake/ingest/streaming/internal/InternalParameterProvider.java b/src/main/java/net/snowflake/ingest/streaming/internal/InternalParameterProvider.java index 4ab8ecc4b..d6589bf93 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/InternalParameterProvider.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/InternalParameterProvider.java @@ -7,6 +7,8 @@ /** A class to provide non-configurable constants depends on Iceberg or non-Iceberg mode */ class InternalParameterProvider { public static final Integer MAX_ROW_GROUP_COUNT_ICEBERG_MODE_DEFAULT = 1; + public static final boolean ENABLE_DISTINCT_VALUES_COUNT_DEFAULT = false; + public static final boolean ENABLE_VALUES_COUNT_DEFAULT = false; private final boolean isIcebergMode; @@ -33,4 +35,15 @@ boolean setIcebergSpecificFieldsInEp() { // in the EP metadata, createdOn, and extendedMetadataSize. return isIcebergMode; } + + boolean isEnableDistinctValuesCount() { + // When in Iceberg mode, we enabled distinct values count in EP metadata. + return isIcebergMode; + } + + boolean isEnableValuesCount() { + // When in Iceberg mode, we enabled values count in EP metadata for repeated group (e.g. map, + // list). 
+ return isIcebergMode; + } } diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java index 5b11996ec..e7272d94a 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java @@ -17,7 +17,7 @@ import net.snowflake.ingest.utils.Pair; import net.snowflake.ingest.utils.SFException; import org.apache.parquet.column.ParquetProperties; -import org.apache.parquet.hadoop.BdecParquetWriter; +import org.apache.parquet.hadoop.SnowflakeParquetWriter; import org.apache.parquet.schema.MessageType; /** @@ -66,7 +66,7 @@ private SerializationResult serializeFromJavaObjects( String firstChannelFullyQualifiedTableName = null; Map columnEpStatsMapCombined = null; List> rows = null; - BdecParquetWriter parquetWriter; + SnowflakeParquetWriter parquetWriter; ByteArrayOutputStream mergedData = new ByteArrayOutputStream(); Pair chunkMinMaxInsertTimeInMs = null; @@ -129,7 +129,7 @@ private SerializationResult serializeFromJavaObjects( // http://go/streams-on-replicated-mixed-tables metadata.put(Constants.PRIMARY_FILE_ID_KEY, StreamingIngestUtils.getShortname(filePath)); parquetWriter = - new BdecParquetWriter( + new SnowflakeParquetWriter( mergedData, schema, metadata, @@ -150,7 +150,8 @@ private SerializationResult serializeFromJavaObjects( rowCount, chunkEstimatedUncompressedSize, mergedData, - chunkMinMaxInsertTimeInMs); + chunkMinMaxInsertTimeInMs, + parquetWriter.getExtendedMetadataSize()); } /** @@ -164,7 +165,7 @@ private SerializationResult serializeFromJavaObjects( * Used only for logging purposes if there is a mismatch. 
*/ private void verifyRowCounts( - BdecParquetWriter writer, + SnowflakeParquetWriter writer, long totalMetadataRowCount, List> channelsDataPerTable, long javaSerializationTotalRowCount) { diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java index 5e3fa1191..96a1156ce 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java @@ -24,7 +24,9 @@ import net.snowflake.ingest.streaming.OffsetTokenVerificationFunction; import net.snowflake.ingest.streaming.OpenChannelRequest; import net.snowflake.ingest.utils.ErrorCode; +import net.snowflake.ingest.utils.IcebergDataTypeParser; import net.snowflake.ingest.utils.SFException; +import net.snowflake.ingest.utils.SubColumnFinder; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.schema.MessageType; @@ -37,8 +39,6 @@ */ public class ParquetRowBuffer extends AbstractRowBuffer { - private final Map fieldIndex; - /* map that contains metadata like typeinfo for columns and other information needed by the server scanner */ private final Map metadata; @@ -49,6 +49,7 @@ public class ParquetRowBuffer extends AbstractRowBuffer { private final ParquetProperties.WriterVersion parquetWriterVersion; private MessageType schema; + private SubColumnFinder subColumnFinder; /** Construct a ParquetRowBuffer object. 
*/ ParquetRowBuffer( @@ -70,7 +71,6 @@ public class ParquetRowBuffer extends AbstractRowBuffer { clientBufferParameters, offsetTokenVerificationFunction, telemetryService); - this.fieldIndex = new HashMap<>(); this.metadata = new HashMap<>(); this.data = new ArrayList<>(); this.tempData = new ArrayList<>(); @@ -113,7 +113,9 @@ public void setupSchema(List columns) { column.getCollation(), column.getOrdinal(), null /* fieldId */, - parquetType.isPrimitive() ? parquetType.asPrimitiveType() : null)); + parquetType.isPrimitive() ? parquetType.asPrimitiveType() : null, + false /* enableDistinctValuesCount */, + false /* enableValuesCount */)); if (onErrorOption == OpenChannelRequest.OnErrorOption.ABORT || onErrorOption == OpenChannelRequest.OnErrorOption.SKIP_BATCH) { @@ -128,7 +130,9 @@ public void setupSchema(List columns) { column.getCollation(), column.getOrdinal(), null /* fieldId */, - parquetType.isPrimitive() ? parquetType.asPrimitiveType() : null)); + parquetType.isPrimitive() ? parquetType.asPrimitiveType() : null, + false /* enableDistinctValuesCount */, + false /* enableValuesCount */)); } } @@ -186,30 +190,60 @@ public void setupSchema(List columns) { */ if (clientBufferParameters.getIsIcebergMode()) { for (ColumnDescriptor columnDescriptor : schema.getColumns()) { - String columnPath = concatDotPath(columnDescriptor.getPath()); + String[] path = columnDescriptor.getPath(); + String columnDotPath = concatDotPath(path); PrimitiveType primitiveType = columnDescriptor.getPrimitiveType(); + boolean isInRepeatedGroup = false; + + if (path.length > 1 + && schema + .getType(Arrays.copyOf(path, path.length - 1)) + .isRepetition(Type.Repetition.REPEATED)) { + if (!primitiveType.getName().equals(IcebergDataTypeParser.ELEMENT) + && !primitiveType.getName().equals(IcebergDataTypeParser.KEY) + && !primitiveType.getName().equals(IcebergDataTypeParser.VALUE)) { + throw new SFException( + ErrorCode.INTERNAL_ERROR, + String.format( + "Invalid repeated column %s, column name 
must be %s, %s or %s", + columnDotPath, + IcebergDataTypeParser.ELEMENT, + IcebergDataTypeParser.KEY, + IcebergDataTypeParser.VALUE)); + } + isInRepeatedGroup = true; + } /* set fieldId to 0 for non-structured columns */ - int fieldId = columnDescriptor.getPath().length == 1 ? 0 : primitiveType.getId().intValue(); + int fieldId = path.length == 1 ? 0 : primitiveType.getId().intValue(); int ordinal = schema.getType(columnDescriptor.getPath()[0]).getId().intValue(); this.statsMap.put( - columnPath, + columnDotPath, new RowBufferStats( - columnPath, null /* collationDefinitionString */, ordinal, fieldId, primitiveType)); + columnDotPath, + null /* collationDefinitionString */, + ordinal, + fieldId, + primitiveType, + clientBufferParameters.isEnableDistinctValuesCount(), + clientBufferParameters.isEnableValuesCount() && isInRepeatedGroup)); if (onErrorOption == OpenChannelRequest.OnErrorOption.ABORT || onErrorOption == OpenChannelRequest.OnErrorOption.SKIP_BATCH) { this.tempStatsMap.put( - columnPath, + columnDotPath, new RowBufferStats( - columnPath, + columnDotPath, null /* collationDefinitionString */, ordinal, fieldId, - primitiveType)); + primitiveType, + clientBufferParameters.isEnableDistinctValuesCount(), + clientBufferParameters.isEnableValuesCount() && isInRepeatedGroup)); } } + subColumnFinder = new SubColumnFinder(schema); } tempData.clear(); data.clear(); @@ -267,7 +301,7 @@ private float addRow( // Create new empty stats just for the current row. Map forkedStatsMap = new HashMap<>(); - statsMap.forEach((columnName, stats) -> forkedStatsMap.put(columnName, stats.forkEmpty())); + statsMap.forEach((columnPath, stats) -> forkedStatsMap.put(columnPath, stats.forkEmpty())); for (Map.Entry entry : row.entrySet()) { String key = entry.getKey(); @@ -279,7 +313,12 @@ private float addRow( ParquetBufferValue valueWithSize = (clientBufferParameters.getIsIcebergMode() ? 
IcebergParquetValueParser.parseColumnValueToParquet( - value, parquetColumn.type, forkedStatsMap, defaultTimezone, insertRowsCurrIndex) + value, + parquetColumn.type, + forkedStatsMap, + subColumnFinder, + defaultTimezone, + insertRowsCurrIndex) : SnowflakeParquetValueParser.parseColumnValueToParquet( value, column, @@ -313,9 +352,25 @@ private float addRow( RowBufferStats.getCombinedStats(statsMap.get(columnName), forkedColStats.getValue())); } - // Increment null count for column missing in the input map + // Increment null count for column and its sub-columns missing in the input map for (String columnName : Sets.difference(this.fieldIndex.keySet(), inputColumnNames)) { - statsMap.get(columnName).incCurrentNullCount(); + if (clientBufferParameters.getIsIcebergMode()) { + if (subColumnFinder == null) { + throw new SFException(ErrorCode.INTERNAL_ERROR, "SubColumnFinder is not initialized."); + } + + for (String subColumn : subColumnFinder.getSubColumns(columnName)) { + RowBufferStats stats = statsMap.get(subColumn); + if (stats == null) { + throw new SFException( + ErrorCode.INTERNAL_ERROR, + String.format("Column %s not found in stats map.", subColumn)); + } + stats.incCurrentNullCount(); + } + } else { + statsMap.get(columnName).incCurrentNullCount(); + } } return size; } diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/RowBufferStats.java b/src/main/java/net/snowflake/ingest/streaming/internal/RowBufferStats.java index 4d7781c78..6d61c8eea 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/RowBufferStats.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/RowBufferStats.java @@ -5,10 +5,14 @@ package net.snowflake.ingest.streaming.internal; import static net.snowflake.ingest.utils.Constants.EP_NDV_UNKNOWN; +import static net.snowflake.ingest.utils.Constants.EP_NV_UNKNOWN; import java.math.BigInteger; import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.HashSet; import 
java.util.Objects; +import java.util.Set; import net.snowflake.ingest.utils.ErrorCode; import net.snowflake.ingest.utils.SFException; import org.apache.parquet.schema.PrimitiveType; @@ -42,26 +46,50 @@ class RowBufferStats { // for binary or string columns private long currentMaxLength; + private final boolean enableDistinctValuesCount; + private Set distinctValues; + private final boolean enableValuesCount; + private long numberOfValues; + RowBufferStats( String columnDisplayName, String collationDefinitionString, int ordinal, Integer fieldId, - PrimitiveType primitiveType) { + PrimitiveType primitiveType, + boolean enableDistinctValuesCount, + boolean enableValuesCount) { this.columnDisplayName = columnDisplayName; this.collationDefinitionString = collationDefinitionString; this.ordinal = ordinal; this.fieldId = fieldId; this.primitiveType = primitiveType; + this.enableDistinctValuesCount = enableDistinctValuesCount; + this.enableValuesCount = enableValuesCount; + if (enableDistinctValuesCount) { + this.distinctValues = new HashSet<>(); + } reset(); } - RowBufferStats(String columnDisplayName) { - this(columnDisplayName, null, -1, null, null); + RowBufferStats( + String columnDisplayName, boolean enableDistinctValuesCount, boolean enableValuesCount) { + this(columnDisplayName, null, -1, null, null, enableDistinctValuesCount, enableValuesCount); } - RowBufferStats(String columnDisplayName, PrimitiveType primitiveType) { - this(columnDisplayName, null, -1, null, primitiveType); + RowBufferStats( + String columnDisplayName, + PrimitiveType primitiveType, + boolean enableDistinctValuesCount, + boolean enableValuesCount) { + this( + columnDisplayName, + null, + -1, + null, + primitiveType, + enableDistinctValuesCount, + enableValuesCount); } void reset() { @@ -73,6 +101,10 @@ void reset() { this.currentMinRealValue = null; this.currentNullCount = 0; this.currentMaxLength = 0; + if (distinctValues != null) { + distinctValues.clear(); + } + this.numberOfValues = 0L; } 
/** Create new statistics for the same column, with all calculated values set to empty */ @@ -82,26 +114,40 @@ RowBufferStats forkEmpty() { this.getCollationDefinitionString(), this.getOrdinal(), this.getFieldId(), - this.getPrimitiveType()); + this.getPrimitiveType(), + this.enableDistinctValuesCount, + this.enableValuesCount); } // TODO performance test this vs in place update static RowBufferStats getCombinedStats(RowBufferStats left, RowBufferStats right) { - if (!Objects.equals(left.getCollationDefinitionString(), right.collationDefinitionString)) { + if (!Objects.equals(left.getCollationDefinitionString(), right.collationDefinitionString) + || left.enableDistinctValuesCount != right.enableDistinctValuesCount + || left.enableValuesCount != right.enableValuesCount) { throw new SFException( ErrorCode.INVALID_COLLATION_STRING, - "Tried to combine stats for different collations", + "Tried to combine stats for different" + + " collations/enableDistinctValuesCount/enableValuesCount", String.format( - "left=%s, right=%s", - left.getCollationDefinitionString(), right.getCollationDefinitionString())); + "left={collations=%s, enableDistinctValuesCount=%s, enableValuesCount=%s}, " + + "right={collations=%s, enableDistinctValuesCount=%s, enableValuesCount=%s}", + left.getCollationDefinitionString(), + left.enableDistinctValuesCount, + left.enableValuesCount, + right.getCollationDefinitionString(), + right.enableDistinctValuesCount, + right.enableValuesCount)); } + RowBufferStats combined = new RowBufferStats( left.columnDisplayName, left.getCollationDefinitionString(), left.getOrdinal(), left.getFieldId(), - left.getPrimitiveType()); + left.getPrimitiveType(), + left.enableDistinctValuesCount, + left.enableValuesCount); if (left.currentMinIntValue != null) { combined.addIntValue(left.currentMinIntValue); @@ -133,6 +179,15 @@ static RowBufferStats getCombinedStats(RowBufferStats left, RowBufferStats right combined.addRealValue(right.currentMaxRealValue); } + if 
(combined.enableDistinctValuesCount) { + combined.distinctValues.addAll(left.distinctValues); + combined.distinctValues.addAll(right.distinctValues); + } + + if (combined.enableValuesCount) { + combined.numberOfValues = left.numberOfValues + right.numberOfValues; + } + combined.currentNullCount = left.currentNullCount + right.currentNullCount; combined.currentMaxLength = Math.max(left.currentMaxLength, right.currentMaxLength); @@ -145,7 +200,6 @@ void addStrValue(String value) { void addBinaryValue(byte[] valueBytes) { this.setCurrentMaxLength(valueBytes.length); - // Check if new min/max string if (this.currentMinStrValue == null) { this.currentMinStrValue = valueBytes; @@ -159,6 +213,13 @@ void addBinaryValue(byte[] valueBytes) { this.currentMaxStrValue = valueBytes; } } + + if (enableDistinctValuesCount) { + distinctValues.add(Arrays.hashCode(valueBytes)); + } + if (enableValuesCount) { + numberOfValues++; + } } byte[] getCurrentMinStrValue() { @@ -179,6 +240,13 @@ void addIntValue(BigInteger value) { } else if (this.currentMaxIntValue.compareTo(value) < 0) { this.currentMaxIntValue = value; } + + if (enableDistinctValuesCount) { + distinctValues.add(value); + } + if (enableValuesCount) { + numberOfValues++; + } } BigInteger getCurrentMinIntValue() { @@ -199,6 +267,13 @@ void addRealValue(Double value) { } else if (this.currentMaxRealValue.compareTo(value) < 0) { this.currentMaxRealValue = value; } + + if (enableDistinctValuesCount) { + distinctValues.add(value); + } + if (enableValuesCount) { + numberOfValues++; + } } Double getCurrentMinRealValue() { @@ -233,7 +308,11 @@ long getCurrentMaxLength() { * @return -1 indicating the NDV is unknown */ long getDistinctValues() { - return EP_NDV_UNKNOWN; + return enableDistinctValuesCount ? distinctValues.size() : EP_NDV_UNKNOWN; + } + + long getNumberOfValues() { + return enableValuesCount ? 
numberOfValues : EP_NV_UNKNOWN; } String getCollationDefinitionString() { diff --git a/src/main/java/net/snowflake/ingest/utils/Constants.java b/src/main/java/net/snowflake/ingest/utils/Constants.java index 7198c7669..a5e04e21e 100644 --- a/src/main/java/net/snowflake/ingest/utils/Constants.java +++ b/src/main/java/net/snowflake/ingest/utils/Constants.java @@ -62,6 +62,7 @@ public class Constants { public static final int MAX_STREAMING_INGEST_API_CHANNEL_RETRY = 3; public static final int STREAMING_INGEST_TELEMETRY_UPLOAD_INTERVAL_IN_SEC = 10; public static final long EP_NDV_UNKNOWN = -1L; + public static final long EP_NV_UNKNOWN = -1L; public static final int MAX_OAUTH_REFRESH_TOKEN_RETRY = 3; public static final int BINARY_COLUMN_MAX_SIZE = 8 * 1024 * 1024; public static final int VARCHAR_COLUMN_MAX_SIZE = 16 * 1024 * 1024; @@ -72,6 +73,7 @@ public class Constants { public static final String DROP_CHANNEL_ENDPOINT = "/v1/streaming/channels/drop/"; public static final String REGISTER_BLOB_ENDPOINT = "/v1/streaming/channels/write/blobs/"; + public static final int PARQUET_MAJOR_VERSION = 1; public static final int PARQUET_MINOR_VERSION = 0; /** diff --git a/src/main/java/net/snowflake/ingest/utils/IcebergDataTypeParser.java b/src/main/java/net/snowflake/ingest/utils/IcebergDataTypeParser.java index abb03cdef..53b4892c2 100644 --- a/src/main/java/net/snowflake/ingest/utils/IcebergDataTypeParser.java +++ b/src/main/java/net/snowflake/ingest/utils/IcebergDataTypeParser.java @@ -26,14 +26,14 @@ * /IcebergDataTypeParser.java */ public class IcebergDataTypeParser { + public static final String ELEMENT = "element"; + public static final String KEY = "key"; + public static final String VALUE = "value"; private static final String TYPE = "type"; private static final String STRUCT = "struct"; private static final String LIST = "list"; private static final String MAP = "map"; private static final String FIELDS = "fields"; - private static final String ELEMENT = "element"; - 
private static final String KEY = "key"; - private static final String VALUE = "value"; private static final String DOC = "doc"; private static final String NAME = "name"; private static final String ID = "id"; diff --git a/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java b/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java index 3d04aa1b2..b86e59525 100644 --- a/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java +++ b/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java @@ -157,73 +157,85 @@ private void setParameterMap( BUFFER_FLUSH_CHECK_INTERVAL_IN_MILLIS_DEFAULT, parameterOverrides, props, - false); + false /* enforceDefault */); this.checkAndUpdate( INSERT_THROTTLE_INTERVAL_IN_MILLIS, INSERT_THROTTLE_INTERVAL_IN_MILLIS_DEFAULT, parameterOverrides, props, - false); + false /* enforceDefault */); this.checkAndUpdate( INSERT_THROTTLE_THRESHOLD_IN_PERCENTAGE, INSERT_THROTTLE_THRESHOLD_IN_PERCENTAGE_DEFAULT, parameterOverrides, props, - false); + false /* enforceDefault */); this.checkAndUpdate( INSERT_THROTTLE_THRESHOLD_IN_BYTES, INSERT_THROTTLE_THRESHOLD_IN_BYTES_DEFAULT, parameterOverrides, props, - false); + false /* enforceDefault */); this.checkAndUpdate( ENABLE_SNOWPIPE_STREAMING_METRICS, SNOWPIPE_STREAMING_METRICS_DEFAULT, parameterOverrides, props, - false); + false /* enforceDefault */); this.checkAndUpdate( - BLOB_FORMAT_VERSION, BLOB_FORMAT_VERSION_DEFAULT, parameterOverrides, props, false); + BLOB_FORMAT_VERSION, + BLOB_FORMAT_VERSION_DEFAULT, + parameterOverrides, + props, + false /* enforceDefault */); getBlobFormatVersion(); // to verify parsing the configured value this.checkAndUpdate( - IO_TIME_CPU_RATIO, IO_TIME_CPU_RATIO_DEFAULT, parameterOverrides, props, false); + IO_TIME_CPU_RATIO, + IO_TIME_CPU_RATIO_DEFAULT, + parameterOverrides, + props, + false /* enforceDefault */); this.checkAndUpdate( BLOB_UPLOAD_MAX_RETRY_COUNT, BLOB_UPLOAD_MAX_RETRY_COUNT_DEFAULT, parameterOverrides, props, - 
false); + false /* enforceDefault */); this.checkAndUpdate( MAX_MEMORY_LIMIT_IN_BYTES, MAX_MEMORY_LIMIT_IN_BYTES_DEFAULT, parameterOverrides, props, - false); + false /* enforceDefault */); this.checkAndUpdate( MAX_CHANNEL_SIZE_IN_BYTES, MAX_CHANNEL_SIZE_IN_BYTES_DEFAULT, parameterOverrides, props, - false); + false /* enforceDefault */); this.checkAndUpdate( - MAX_CHUNK_SIZE_IN_BYTES, MAX_CHUNK_SIZE_IN_BYTES_DEFAULT, parameterOverrides, props, false); + MAX_CHUNK_SIZE_IN_BYTES, + MAX_CHUNK_SIZE_IN_BYTES_DEFAULT, + parameterOverrides, + props, + false /* enforceDefault */); this.checkAndUpdate( MAX_CLIENT_LAG, isIcebergMode ? MAX_CLIENT_LAG_ICEBERG_MODE_DEFAULT : MAX_CLIENT_LAG_DEFAULT, parameterOverrides, props, - false); + false /* enforceDefault */); this.checkAndUpdate( MAX_CHUNKS_IN_BLOB, @@ -237,21 +249,21 @@ private void setParameterMap( MAX_CHUNKS_IN_REGISTRATION_REQUEST_DEFAULT, parameterOverrides, props, - false); + false /* enforceDefault */); this.checkAndUpdate( BDEC_PARQUET_COMPRESSION_ALGORITHM, BDEC_PARQUET_COMPRESSION_ALGORITHM_DEFAULT, parameterOverrides, props, - false); + false /* enforceDefault */); this.checkAndUpdate( ENABLE_NEW_JSON_PARSING_LOGIC, ENABLE_NEW_JSON_PARSING_LOGIC_DEFAULT, parameterOverrides, props, - false); + false /* enforceDefault */); if (getMaxChunksInBlob() > getMaxChunksInRegistrationRequest()) { throw new IllegalArgumentException( diff --git a/src/main/java/net/snowflake/ingest/utils/SubColumnFinder.java b/src/main/java/net/snowflake/ingest/utils/SubColumnFinder.java new file mode 100644 index 000000000..2b71eea27 --- /dev/null +++ b/src/main/java/net/snowflake/ingest/utils/SubColumnFinder.java @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024 Snowflake Computing Inc. All rights reserved. 
+ */ + +package net.snowflake.ingest.utils; + +import static net.snowflake.ingest.utils.Utils.concatDotPath; +import static net.snowflake.ingest.utils.Utils.isNullOrEmpty; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; + +/** Helper class to find all leaf sub-columns in an immutable schema given a dot path. */ +public class SubColumnFinder { + static class SubtreeInterval { + final int startTag; + final int endTag; + + SubtreeInterval(int startTag, int endTag) { + this.startTag = startTag; + this.endTag = endTag; + } + } + + private final List list; + private final Map accessMap; + + public SubColumnFinder(MessageType schema) { + accessMap = new HashMap<>(); + list = new ArrayList<>(); + build(schema, null); + } + + public List getSubColumns(String dotPath) { + if (!accessMap.containsKey(dotPath)) { + throw new IllegalArgumentException(String.format("Column %s not found in schema", dotPath)); + } + SubtreeInterval interval = accessMap.get(dotPath); + return Collections.unmodifiableList(list.subList(interval.startTag, interval.endTag)); + } + + private void build(Type node, String dotPath) { + if (dotPath == null) { + /* Ignore root node type name (bdec or schema) */ + dotPath = ""; + } else if (dotPath.isEmpty()) { + dotPath = node.getName(); + } else { + dotPath = concatDotPath(dotPath, node.getName()); + } + + int startTag = list.size(); + if (!node.isPrimitive()) { + for (Type child : node.asGroupType().getFields()) { + build(child, dotPath); + } + } else { + list.add(dotPath); + } + if (!isNullOrEmpty(dotPath)) { + accessMap.put(dotPath, new SubtreeInterval(startTag, list.size())); + } + } +} diff --git a/src/main/java/net/snowflake/ingest/utils/Utils.java b/src/main/java/net/snowflake/ingest/utils/Utils.java index 95d941036..ca71a8233 100644 --- 
a/src/main/java/net/snowflake/ingest/utils/Utils.java +++ b/src/main/java/net/snowflake/ingest/utils/Utils.java @@ -8,6 +8,7 @@ import com.codahale.metrics.Timer; import io.netty.util.internal.PlatformDependent; +import java.io.IOException; import java.io.StringReader; import java.lang.management.BufferPoolMXBean; import java.lang.management.ManagementFactory; @@ -29,6 +30,8 @@ import java.util.Properties; import net.snowflake.client.core.SFSessionProperty; import org.apache.commons.codec.binary.Base64; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.hadoop.ParquetFileWriter; import org.bouncycastle.asn1.pkcs.PrivateKeyInfo; import org.bouncycastle.openssl.PEMParser; import org.bouncycastle.openssl.jcajce.JcaPEMKeyConverter; @@ -412,7 +415,7 @@ public static String getFullyQualifiedChannelName( return String.format("%s.%s.%s.%s", dbName, schemaName, tableName, channelName); } - /* + /** * Get concat dot path, check if any path is empty or null * * @param path the path @@ -430,4 +433,31 @@ public static String concatDotPath(String... path) { } return sb.toString(); } + + /** + * Get the footer size (metadata size) of a parquet file + * + * @param bytes the serialized parquet file + * @return the footer size + */ + public static long getParquetFooterSize(byte[] bytes) throws IOException { + final int magicOffset = bytes.length - ParquetFileWriter.MAGIC.length; + final int footerSizeOffset = magicOffset - Integer.BYTES; + + if (footerSizeOffset < 0) { + throw new IllegalArgumentException( + String.format("Invalid parquet file. File too small, file length=%s.", bytes.length)); + } + + String fileMagic = new String(bytes, magicOffset, ParquetFileWriter.MAGIC.length); + if (!ParquetFileWriter.MAGIC_STR.equals(fileMagic) + && !ParquetFileWriter.EF_MAGIC_STR.equals(fileMagic)) { + throw new IllegalArgumentException( + String.format( + "Invalid parquet file. 
Bad parquet magic, expected=[%s | %s], actual=%s.", + ParquetFileWriter.MAGIC_STR, ParquetFileWriter.EF_MAGIC_STR, fileMagic)); + } + + return BytesUtils.readIntLittleEndian(bytes, footerSizeOffset); + } } diff --git a/src/main/java/org/apache/parquet/hadoop/BdecParquetReader.java b/src/main/java/org/apache/parquet/hadoop/BdecParquetReader.java index ef95fab14..b1351be84 100644 --- a/src/main/java/org/apache/parquet/hadoop/BdecParquetReader.java +++ b/src/main/java/org/apache/parquet/hadoop/BdecParquetReader.java @@ -1,9 +1,10 @@ /* - * Copyright (c) 2022 Snowflake Computing Inc. All rights reserved. + * Copyright (c) 2022-2024 Snowflake Computing Inc. All rights reserved. */ package org.apache.parquet.hadoop; +import com.google.common.annotations.VisibleForTesting; import java.io.ByteArrayInputStream; import java.io.IOException; import java.util.Arrays; @@ -82,7 +83,7 @@ public void close() throws IOException { * @param data input data to be read first and then written with outputWriter * @param outputWriter output parquet writer */ - public static void readFileIntoWriter(byte[] data, BdecParquetWriter outputWriter) { + public static void readFileIntoWriter(byte[] data, SnowflakeParquetWriter outputWriter) { try (BdecParquetReader reader = new BdecParquetReader(data)) { for (List record = reader.read(); record != null; record = reader.read()) { outputWriter.writeRow(record); @@ -92,10 +93,11 @@ public static void readFileIntoWriter(byte[] data, BdecParquetWriter outputWrite } } - private static class BdecInputFile implements InputFile { + @VisibleForTesting + public static class BdecInputFile implements InputFile { private final byte[] data; - private BdecInputFile(byte[] data) { + public BdecInputFile(byte[] data) { this.data = data; } diff --git a/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java b/src/main/java/org/apache/parquet/hadoop/SnowflakeParquetWriter.java similarity index 91% rename from 
src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java rename to src/main/java/org/apache/parquet/hadoop/SnowflakeParquetWriter.java index c3126847d..c3d18cae5 100644 --- a/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java +++ b/src/main/java/org/apache/parquet/hadoop/SnowflakeParquetWriter.java @@ -19,6 +19,7 @@ import org.apache.parquet.crypto.FileEncryptionProperties; import org.apache.parquet.hadoop.api.WriteSupport; import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.io.DelegatingPositionOutputStream; import org.apache.parquet.io.OutputFile; import org.apache.parquet.io.ParquetEncodingException; @@ -31,12 +32,13 @@ import org.apache.parquet.schema.Type; /** - * BDEC specific parquet writer. + * Snowflake specific parquet writer, supports BDEC file for FDN tables and parquet file for Iceberg + * tables. * *

Resides in parquet package because, it uses {@link InternalParquetRecordWriter} and {@link * CodecFactory} that are package private. */ -public class BdecParquetWriter implements AutoCloseable { +public class SnowflakeParquetWriter implements AutoCloseable { private final InternalParquetRecordWriter> writer; private final CodecFactory codecFactory; @@ -50,7 +52,7 @@ public class BdecParquetWriter implements AutoCloseable { private long rowsWritten = 0; /** - * Creates a BDEC specific parquet writer. + * Creates a Snowflake specific parquet writer. * * @param stream output * @param schema row schema @@ -60,7 +62,7 @@ public class BdecParquetWriter implements AutoCloseable { * exceeded we'll end up throwing * @throws IOException */ - public BdecParquetWriter( + public SnowflakeParquetWriter( ByteArrayOutputStream stream, MessageType schema, Map extraMetaData, @@ -78,7 +80,7 @@ public BdecParquetWriter( ParquetProperties encodingProps = createParquetProperties(); Configuration conf = new Configuration(); WriteSupport> writeSupport = - new BdecWriteSupport(schema, extraMetaData, channelName); + new SnowflakeWriteSupport(schema, extraMetaData, channelName); WriteSupport.WriteContext writeContext = writeSupport.init(conf); ParquetFileWriter fileWriter = @@ -138,6 +140,24 @@ public List getRowCountsFromFooter() { return blockRowCounts; } + /** @return extended metadata size (page index size + bloom filter size) */ + public long getExtendedMetadataSize() { + long extendedMetadataSize = 0; + for (BlockMetaData metadata : writer.getFooter().getBlocks()) { + for (ColumnChunkMetaData column : metadata.getColumns()) { + extendedMetadataSize += + (column.getColumnIndexReference() != null + ? column.getColumnIndexReference().getLength() + : 0) + + (column.getOffsetIndexReference() != null + ? column.getOffsetIndexReference().getLength() + : 0) + + (column.getBloomFilterLength() == -1 ? 
0 : column.getBloomFilterLength()); + } + } + return extendedMetadataSize; + } + public void writeRow(List row) { try { writer.write(row); @@ -263,16 +283,17 @@ public long getPos() { * *

This class is implemented as parquet library API requires, mostly to serialize user column * values depending on type into Parquet {@link RecordConsumer} in {@link - * BdecWriteSupport#write(List)}. + * SnowflakeWriteSupport#write(List)}. */ - private static class BdecWriteSupport extends WriteSupport> { + private static class SnowflakeWriteSupport extends WriteSupport> { MessageType schema; RecordConsumer recordConsumer; Map extraMetadata; private final String channelName; // TODO SNOW-672156: support specifying encodings and compression - BdecWriteSupport(MessageType schema, Map extraMetadata, String channelName) { + SnowflakeWriteSupport( + MessageType schema, Map extraMetadata, String channelName) { this.schema = schema; this.extraMetadata = extraMetadata; this.channelName = channelName; diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/BlobBuilderTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/BlobBuilderTest.java index 04a740272..9fc585483 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/BlobBuilderTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/BlobBuilderTest.java @@ -4,6 +4,7 @@ package net.snowflake.ingest.streaming.internal; +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Collections; @@ -15,11 +16,20 @@ import net.snowflake.ingest.utils.Pair; import net.snowflake.ingest.utils.SFException; import org.apache.parquet.column.ParquetProperties; -import org.apache.parquet.hadoop.BdecParquetWriter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.SnowflakeParquetWriter; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.io.DelegatingSeekableInputStream; +import 
org.apache.parquet.io.InputFile; +import org.apache.parquet.io.SeekableInputStream; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Types; +import org.assertj.core.api.Assertions; import org.junit.Assert; import org.junit.Test; import org.junit.runner.RunWith; @@ -63,6 +73,51 @@ public void testSerializationErrors() throws Exception { } } + @Test + public void testMetadataAndExtendedMetadataSize() throws Exception { + if (!isIceberg) { + return; + } + + BlobBuilder.Blob blob = + BlobBuilder.constructBlobAndMetadata( + "a.parquet", + Collections.singletonList(createChannelDataPerTable(1)), + Constants.BdecVersion.THREE, + new InternalParameterProvider(isIceberg)); + + InputFile blobInputFile = new InMemoryInputFile(blob.blobBytes); + ParquetFileReader reader = ParquetFileReader.open(blobInputFile); + ParquetMetadata footer = reader.getFooter(); + + int extendedMetadataSize = 0; + long extendedMetadaOffset = 0; + for (BlockMetaData block : footer.getBlocks()) { + for (ColumnChunkMetaData column : block.getColumns()) { + extendedMetadataSize += + (column.getColumnIndexReference() != null + ? column.getColumnIndexReference().getLength() + : 0) + + (column.getOffsetIndexReference() != null + ? column.getOffsetIndexReference().getLength() + : 0) + + (column.getBloomFilterLength() == -1 ? 
0 : column.getBloomFilterLength()); + extendedMetadaOffset = + Math.max(column.getFirstDataPageOffset() + column.getTotalSize(), extendedMetadaOffset); + } + } + Assertions.assertThat(blob.chunksMetadataList.size()).isEqualTo(1); + Assertions.assertThat(blob.chunksMetadataList.get(0).getExtendedMetadataSize()) + .isEqualTo(extendedMetadataSize); + Assertions.assertThat(blob.chunksMetadataList.get(0).getMetadataSize()) + .isEqualTo( + blob.blobBytes.length + - extendedMetadaOffset + - extendedMetadataSize + - ParquetFileWriter.MAGIC.length + - Integer.BYTES); + } + /** * Creates a channel data configurable number of rows in metadata and 1 physical row (using both * with and without internal buffering optimization) @@ -87,8 +142,8 @@ private List> createChannelDataPerTable(int metada channelData.setRowSequencer(1L); ByteArrayOutputStream stream = new ByteArrayOutputStream(); - BdecParquetWriter bdecParquetWriter = - new BdecParquetWriter( + SnowflakeParquetWriter snowflakeParquetWriter = + new SnowflakeParquetWriter( stream, schema, new HashMap<>(), @@ -100,7 +155,7 @@ private List> createChannelDataPerTable(int metada ? 
ParquetProperties.WriterVersion.PARQUET_2_0 : ParquetProperties.WriterVersion.PARQUET_1_0, isIceberg); - bdecParquetWriter.writeRow(Collections.singletonList("1")); + snowflakeParquetWriter.writeRow(Collections.singletonList("1")); channelData.setVectors( new ParquetChunkData( Collections.singletonList(Collections.singletonList("A")), new HashMap<>())); @@ -121,8 +176,10 @@ private List> createChannelDataPerTable(int metada Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) .as(LogicalTypeAnnotation.stringType()) .id(1) - .named("test")) - : new RowBufferStats(columnName, null, 1, null, null)); + .named("test"), + isIceberg, + isIceberg) + : new RowBufferStats(columnName, null, 1, null, null, false, false)); channelData.setChannelContext( new ChannelFlushContext("channel1", "DB", "SCHEMA", "TABLE", 1L, "enc", 1L)); return Collections.singletonList(channelData); @@ -131,7 +188,7 @@ private List> createChannelDataPerTable(int metada private static MessageType createSchema(String columnName) { ParquetTypeInfo c1 = ParquetTypeGenerator.generateColumnParquetTypeInfo(createTestTextColumn(columnName), 1); - return new MessageType("bdec", Collections.singletonList(c1.getParquetType())); + return new MessageType("InMemory", Collections.singletonList(c1.getParquetType())); } private static ColumnMetadata createTestTextColumn(String name) { @@ -146,4 +203,55 @@ private static ColumnMetadata createTestTextColumn(String name) { colChar.setScale(0); return colChar; } + + static class InMemoryInputFile implements InputFile { + private final byte[] data; + + public InMemoryInputFile(byte[] data) { + this.data = data; + } + + @Override + public long getLength() { + return data.length; + } + + @Override + public SeekableInputStream newStream() { + return new InMemorySeekableInputStream(new InMemoryByteArrayInputStream(data)); + } + } + + private static class InMemorySeekableInputStream extends DelegatingSeekableInputStream { + private final InMemoryByteArrayInputStream stream; + 
+ public InMemorySeekableInputStream(InMemoryByteArrayInputStream stream) { + super(stream); + this.stream = stream; + } + + @Override + public long getPos() { + return stream.getPos(); + } + + @Override + public void seek(long newPos) { + stream.seek(newPos); + } + } + + private static class InMemoryByteArrayInputStream extends ByteArrayInputStream { + public InMemoryByteArrayInputStream(byte[] buf) { + super(buf); + } + + long getPos() { + return pos; + } + + void seek(long newPos) { + pos = (int) newPos; + } + } } diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/ChannelDataTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/ChannelDataTest.java index 5b545d697..d76499c14 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/ChannelDataTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/ChannelDataTest.java @@ -1,3 +1,7 @@ +/* + * Copyright (c) 2024 Snowflake Computing Inc. All rights reserved. + */ + package net.snowflake.ingest.streaming.internal; import java.math.BigInteger; @@ -8,13 +12,22 @@ import net.snowflake.ingest.utils.SFException; import org.junit.Assert; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +@RunWith(Parameterized.class) public class ChannelDataTest { + @Parameterized.Parameters(name = "enableNDVAndNV: {0}") + public static Object[] enableNDVAndNV() { + return new Object[] {false, true}; + } + + @Parameterized.Parameter public boolean enableNDVAndNV; @Test public void testGetCombinedColumnStatsMapNulls() { Map left = new HashMap<>(); - RowBufferStats leftStats1 = new RowBufferStats("COL1"); + RowBufferStats leftStats1 = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); left.put("one", leftStats1); leftStats1.addIntValue(new BigInteger("10")); @@ -43,12 +56,12 @@ public void testGetCombinedColumnStatsMapNulls() { @Test public void testGetCombinedColumnStatsMapMissingColumn() { Map left = new HashMap<>(); - 
RowBufferStats leftStats1 = new RowBufferStats("COL1"); + RowBufferStats leftStats1 = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); left.put("one", leftStats1); leftStats1.addIntValue(new BigInteger("10")); Map right = new HashMap<>(); - RowBufferStats rightStats1 = new RowBufferStats("COL1"); + RowBufferStats rightStats1 = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); right.put("foo", rightStats1); rightStats1.addIntValue(new BigInteger("11")); @@ -78,10 +91,10 @@ public void testGetCombinedColumnStatsMap() { Map left = new HashMap<>(); Map right = new HashMap<>(); - RowBufferStats leftStats1 = new RowBufferStats("COL1"); - RowBufferStats rightStats1 = new RowBufferStats("COL1"); - RowBufferStats leftStats2 = new RowBufferStats("COL1"); - RowBufferStats rightStats2 = new RowBufferStats("COL1"); + RowBufferStats leftStats1 = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); + RowBufferStats rightStats1 = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); + RowBufferStats leftStats2 = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); + RowBufferStats rightStats2 = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); left.put("one", leftStats1); left.put("two", leftStats2); @@ -107,20 +120,24 @@ public void testGetCombinedColumnStatsMap() { Assert.assertEquals(new BigInteger("10"), oneCombined.getCurrentMinIntValue()); Assert.assertEquals(new BigInteger("17"), oneCombined.getCurrentMaxIntValue()); - Assert.assertEquals(-1, oneCombined.getDistinctValues()); Assert.assertNull(oneCombined.getCurrentMinStrValue()); Assert.assertNull(oneCombined.getCurrentMaxStrValue()); Assert.assertNull(oneCombined.getCurrentMinRealValue()); Assert.assertNull(oneCombined.getCurrentMaxRealValue()); + Assert.assertEquals(enableNDVAndNV ? 5 : -1, oneCombined.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 
5 : -1, oneCombined.getNumberOfValues()); + Assert.assertArrayEquals( "10".getBytes(StandardCharsets.UTF_8), twoCombined.getCurrentMinStrValue()); Assert.assertArrayEquals( "17".getBytes(StandardCharsets.UTF_8), twoCombined.getCurrentMaxStrValue()); - Assert.assertEquals(-1, twoCombined.getDistinctValues()); Assert.assertNull(twoCombined.getCurrentMinIntValue()); Assert.assertNull(twoCombined.getCurrentMaxIntValue()); Assert.assertNull(twoCombined.getCurrentMinRealValue()); Assert.assertNull(twoCombined.getCurrentMaxRealValue()); + + Assert.assertEquals(enableNDVAndNV ? 5 : -1, twoCombined.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 5 : -1, twoCombined.getNumberOfValues()); } } diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/FileColumnPropertiesTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/FileColumnPropertiesTest.java index f4ffa11c4..0818acfea 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/FileColumnPropertiesTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/FileColumnPropertiesTest.java @@ -32,8 +32,10 @@ public void testFileColumnPropertiesConstructor() { Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) .as(LogicalTypeAnnotation.stringType()) .id(1) - .named("test")) - : new RowBufferStats("COL", null, 1, null, null); + .named("test"), + isIceberg, + isIceberg) + : new RowBufferStats("COL", null, 1, null, null, false, false); stats.addStrValue("bcd"); stats.addStrValue("abcde"); FileColumnProperties props = new FileColumnProperties(stats, !isIceberg); @@ -55,8 +57,10 @@ public void testFileColumnPropertiesConstructor() { Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) .as(LogicalTypeAnnotation.stringType()) .id(1) - .named("test")) - : new RowBufferStats("COL", null, 1, null, null); + .named("test"), + isIceberg, + isIceberg) + : new RowBufferStats("COL", null, 1, null, null, false, false); stats.addStrValue("aßßßßßßßßßßßßßßßß"); Assert.assertEquals(33, 
stats.getCurrentMinStrValue().length); props = new FileColumnProperties(stats, !isIceberg); diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java index a7e4ba35b..2a1a3d97b 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java @@ -884,10 +884,16 @@ public void testBuildAndUpload() throws Exception { RowBufferStats stats1 = new RowBufferStats( - "COL1", Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named("COL1")); + "COL1", + Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named("COL1"), + isIcebergMode, + isIcebergMode); RowBufferStats stats2 = new RowBufferStats( - "COL1", Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named("COL1")); + "COL1", + Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named("COL1"), + isIcebergMode, + isIcebergMode); eps1.put("one", stats1); eps2.put("one", stats2); @@ -919,7 +925,7 @@ public void testBuildAndUpload() throws Exception { EpInfo expectedChunkEpInfo = AbstractRowBuffer.buildEpInfoFromStats( - 3, ChannelData.getCombinedColumnStatsMap(eps1, eps2), !isIcebergMode); + 3, ChannelData.getCombinedColumnStatsMap(eps1, eps2), !isIcebergMode, isIcebergMode); ChannelMetadata expectedChannel1Metadata = ChannelMetadata.builder() @@ -1049,8 +1055,11 @@ public void testInvalidateChannels() { Mockito.mock(SnowflakeStreamingIngestClientInternal.class); ParameterProvider parameterProvider = new ParameterProvider(isIcebergMode); ChannelCache channelCache = new ChannelCache<>(); + InternalParameterProvider internalParameterProvider = + new InternalParameterProvider(isIcebergMode); Mockito.when(client.getChannelCache()).thenReturn(channelCache); Mockito.when(client.getParameterProvider()).thenReturn(parameterProvider); + 
Mockito.when(client.getInternalParameterProvider()).thenReturn(internalParameterProvider); SnowflakeStreamingIngestChannelInternal channel1 = new SnowflakeStreamingIngestChannelInternal<>( "channel1", @@ -1134,13 +1143,16 @@ public void testBlobBuilder() throws Exception { RowBufferStats stats1 = new RowBufferStats( - "COL1", Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named("COL1")); + "COL1", + Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named("COL1"), + isIcebergMode, + isIcebergMode); eps1.put("one", stats1); stats1.addIntValue(new BigInteger("10")); stats1.addIntValue(new BigInteger("15")); - EpInfo epInfo = AbstractRowBuffer.buildEpInfoFromStats(2, eps1, !isIcebergMode); + EpInfo epInfo = AbstractRowBuffer.buildEpInfoFromStats(2, eps1, !isIcebergMode, isIcebergMode); ChannelMetadata channelMetadata = ChannelMetadata.builder() diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/IcebergParquetValueParserTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/IcebergParquetValueParserTest.java index 007dc3e23..279e100fd 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/IcebergParquetValueParserTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/IcebergParquetValueParserTest.java @@ -21,32 +21,46 @@ import java.util.Map; import net.snowflake.ingest.utils.Pair; import net.snowflake.ingest.utils.SFException; +import net.snowflake.ingest.utils.SubColumnFinder; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Type.Repetition; import org.apache.parquet.schema.Types; import org.junit.Assert; +import org.junit.Before; import org.junit.Test; +import org.mockito.Mockito; public class IcebergParquetValueParserTest { - static ObjectMapper objectMapper = new ObjectMapper(); + static ObjectMapper objectMapper; + static SubColumnFinder 
mockSubColumnFinder; + + @Before + public void setUp() { + objectMapper = new ObjectMapper(); + mockSubColumnFinder = Mockito.mock(SubColumnFinder.class); + Mockito.when(mockSubColumnFinder.getSubColumns(Mockito.anyString())) + .thenReturn(Collections.emptyList()); + } @Test public void parseValueBoolean() { Type type = Types.primitive(PrimitiveTypeName.BOOLEAN, Repetition.OPTIONAL).named("BOOLEAN_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("BOOLEAN_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("BOOLEAN_COL", true, true); Map rowBufferStatsMap = new HashMap() { { put("BOOLEAN_COL", rowBufferStats); } }; + ParquetBufferValue pv = - IcebergParquetValueParser.parseColumnValueToParquet(true, type, rowBufferStatsMap, UTC, 0); + IcebergParquetValueParser.parseColumnValueToParquet( + true, type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -61,7 +75,7 @@ public void parseValueBoolean() { public void parseValueInt() { Type type = Types.primitive(PrimitiveTypeName.INT32, Repetition.OPTIONAL).named("INT_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("INT_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("INT_COL", true, true); Map rowBufferStatsMap = new HashMap() { { @@ -70,7 +84,7 @@ public void parseValueInt() { }; ParquetBufferValue pv = IcebergParquetValueParser.parseColumnValueToParquet( - Integer.MAX_VALUE, type, rowBufferStatsMap, UTC, 0); + Integer.MAX_VALUE, type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -88,7 +102,7 @@ public void parseValueDecimalToInt() { .as(LogicalTypeAnnotation.decimalType(4, 9)) .named("DECIMAL_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("DECIMAL_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("DECIMAL_COL", true, true); Map 
rowBufferStatsMap = new HashMap() { { @@ -97,7 +111,7 @@ public void parseValueDecimalToInt() { }; ParquetBufferValue pv = IcebergParquetValueParser.parseColumnValueToParquet( - new BigDecimal("12345.6789"), type, rowBufferStatsMap, UTC, 0); + new BigDecimal("12345.6789"), type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -115,7 +129,7 @@ public void parseValueDateToInt() { .as(LogicalTypeAnnotation.dateType()) .named("DATE_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("DATE_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("DATE_COL", true, true); Map rowBufferStatsMap = new HashMap() { { @@ -124,7 +138,7 @@ public void parseValueDateToInt() { }; ParquetBufferValue pv = IcebergParquetValueParser.parseColumnValueToParquet( - "2024-01-01", type, rowBufferStatsMap, UTC, 0); + "2024-01-01", type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -139,7 +153,7 @@ public void parseValueDateToInt() { public void parseValueLong() { Type type = Types.primitive(PrimitiveTypeName.INT64, Repetition.OPTIONAL).named("LONG_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("LONG_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("LONG_COL", true, true); Map rowBufferStatsMap = new HashMap() { { @@ -148,7 +162,7 @@ public void parseValueLong() { }; ParquetBufferValue pv = IcebergParquetValueParser.parseColumnValueToParquet( - Long.MAX_VALUE, type, rowBufferStatsMap, UTC, 0); + Long.MAX_VALUE, type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -166,7 +180,7 @@ public void parseValueDecimalToLong() { .as(LogicalTypeAnnotation.decimalType(9, 18)) .named("DECIMAL_COL"); - RowBufferStats rowBufferStats = new 
RowBufferStats("DECIMAL_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("DECIMAL_COL", true, true); Map rowBufferStatsMap = new HashMap() { { @@ -175,7 +189,12 @@ public void parseValueDecimalToLong() { }; ParquetBufferValue pv = IcebergParquetValueParser.parseColumnValueToParquet( - new BigDecimal("123456789.123456789"), type, rowBufferStatsMap, UTC, 0); + new BigDecimal("123456789.123456789"), + type, + rowBufferStatsMap, + mockSubColumnFinder, + UTC, + 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -193,7 +212,7 @@ public void parseValueTimeToLong() { .as(LogicalTypeAnnotation.timeType(false, LogicalTypeAnnotation.TimeUnit.MICROS)) .named("TIME_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("TIME_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("TIME_COL", true, true); Map rowBufferStatsMap = new HashMap() { { @@ -202,7 +221,7 @@ public void parseValueTimeToLong() { }; ParquetBufferValue pv = IcebergParquetValueParser.parseColumnValueToParquet( - "12:34:56.789", type, rowBufferStatsMap, UTC, 0); + "12:34:56.789", type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -220,7 +239,7 @@ public void parseValueTimestampToLong() { .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MICROS)) .named("TIMESTAMP_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("TIMESTAMP_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("TIMESTAMP_COL", true, true); Map rowBufferStatsMap = new HashMap() { { @@ -229,7 +248,7 @@ public void parseValueTimestampToLong() { }; ParquetBufferValue pv = IcebergParquetValueParser.parseColumnValueToParquet( - "2024-01-01T12:34:56.789+08:00", type, rowBufferStatsMap, UTC, 0); + "2024-01-01T12:34:56.789+08:00", type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); 
ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -247,7 +266,7 @@ public void parseValueTimestampTZToLong() { .as(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS)) .named("TIMESTAMP_TZ_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("TIMESTAMP_TZ_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("TIMESTAMP_TZ_COL", true, true); Map rowBufferStatsMap = new HashMap() { { @@ -256,7 +275,7 @@ public void parseValueTimestampTZToLong() { }; ParquetBufferValue pv = IcebergParquetValueParser.parseColumnValueToParquet( - "2024-01-01T12:34:56.789+08:00", type, rowBufferStatsMap, UTC, 0); + "2024-01-01T12:34:56.789+08:00", type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -271,7 +290,7 @@ public void parseValueTimestampTZToLong() { public void parseValueFloat() { Type type = Types.primitive(PrimitiveTypeName.FLOAT, Repetition.OPTIONAL).named("FLOAT_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("FLOAT_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("FLOAT_COL", true, true); Map rowBufferStatsMap = new HashMap() { { @@ -280,7 +299,7 @@ public void parseValueFloat() { }; ParquetBufferValue pv = IcebergParquetValueParser.parseColumnValueToParquet( - Float.MAX_VALUE, type, rowBufferStatsMap, UTC, 0); + Float.MAX_VALUE, type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -295,7 +314,7 @@ public void parseValueFloat() { public void parseValueDouble() { Type type = Types.primitive(PrimitiveTypeName.DOUBLE, Repetition.OPTIONAL).named("DOUBLE_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("DOUBLE_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("DOUBLE_COL", true, true); Map rowBufferStatsMap = new HashMap() 
{ { @@ -304,7 +323,7 @@ public void parseValueDouble() { }; ParquetBufferValue pv = IcebergParquetValueParser.parseColumnValueToParquet( - Double.MAX_VALUE, type, rowBufferStatsMap, UTC, 0); + Double.MAX_VALUE, type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -319,7 +338,7 @@ public void parseValueDouble() { public void parseValueBinary() { Type type = Types.primitive(PrimitiveTypeName.BINARY, Repetition.OPTIONAL).named("BINARY_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("BINARY_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("BINARY_COL", true, true); Map rowBufferStatsMap = new HashMap() { { @@ -328,7 +347,8 @@ public void parseValueBinary() { }; byte[] value = "snowflake_to_the_moon".getBytes(); ParquetBufferValue pv = - IcebergParquetValueParser.parseColumnValueToParquet(value, type, rowBufferStatsMap, UTC, 0); + IcebergParquetValueParser.parseColumnValueToParquet( + value, type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -347,7 +367,7 @@ public void parseValueStringToBinary() { .as(LogicalTypeAnnotation.stringType()) .named("BINARY_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("BINARY_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("BINARY_COL", true, true); Map rowBufferStatsMap = new HashMap() { { @@ -356,7 +376,8 @@ public void parseValueStringToBinary() { }; String value = "snowflake_to_the_moon"; ParquetBufferValue pv = - IcebergParquetValueParser.parseColumnValueToParquet(value, type, rowBufferStatsMap, UTC, 0); + IcebergParquetValueParser.parseColumnValueToParquet( + value, type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -377,7 +398,7 @@ public void parseValueFixed() 
{ .length(4) .named("FIXED_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("FIXED_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("FIXED_COL", true, true); Map rowBufferStatsMap = new HashMap() { { @@ -386,7 +407,8 @@ public void parseValueFixed() { }; byte[] value = "snow".getBytes(); ParquetBufferValue pv = - IcebergParquetValueParser.parseColumnValueToParquet(value, type, rowBufferStatsMap, UTC, 0); + IcebergParquetValueParser.parseColumnValueToParquet( + value, type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -406,7 +428,7 @@ public void parseValueDecimalToFixed() { .as(LogicalTypeAnnotation.decimalType(10, 20)) .named("FIXED_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("FIXED_COL"); + RowBufferStats rowBufferStats = new RowBufferStats("FIXED_COL", true, true); Map rowBufferStatsMap = new HashMap() { { @@ -415,7 +437,8 @@ public void parseValueDecimalToFixed() { }; BigDecimal value = new BigDecimal("1234567890.0123456789"); ParquetBufferValue pv = - IcebergParquetValueParser.parseColumnValueToParquet(value, type, rowBufferStatsMap, UTC, 0); + IcebergParquetValueParser.parseColumnValueToParquet( + value, type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .rowBufferStats(rowBufferStats) @@ -433,7 +456,7 @@ public void parseList() throws JsonProcessingException { Types.optionalList() .element(Types.optional(PrimitiveTypeName.INT32).named("element")) .named("LIST_COL"); - RowBufferStats rowBufferStats = new RowBufferStats("LIST_COL.list.element"); + RowBufferStats rowBufferStats = new RowBufferStats("LIST_COL.list.element", true, true); Map rowBufferStatsMap = new HashMap() { { @@ -441,10 +464,11 @@ public void parseList() throws JsonProcessingException { } }; - IcebergParquetValueParser.parseColumnValueToParquet(null, list, 
rowBufferStatsMap, UTC, 0); + IcebergParquetValueParser.parseColumnValueToParquet( + null, list, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetBufferValue pv = IcebergParquetValueParser.parseColumnValueToParquet( - Arrays.asList(1, 2, 3, 4, 5), list, rowBufferStatsMap, UTC, 0); + Arrays.asList(1, 2, 3, 4, 5), list, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .rowBufferStats(rowBufferStats) .parquetBufferValue(pv) @@ -467,10 +491,10 @@ public void parseList() throws JsonProcessingException { SFException.class, () -> IcebergParquetValueParser.parseColumnValueToParquet( - null, requiredList, rowBufferStatsMap, UTC, 0)); + null, requiredList, rowBufferStatsMap, mockSubColumnFinder, UTC, 0)); pv = IcebergParquetValueParser.parseColumnValueToParquet( - new ArrayList<>(), requiredList, rowBufferStatsMap, UTC, 0); + new ArrayList<>(), requiredList, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .rowBufferStats(rowBufferStats) .parquetBufferValue(pv) @@ -490,7 +514,12 @@ public void parseList() throws JsonProcessingException { SFException.class, () -> IcebergParquetValueParser.parseColumnValueToParquet( - Collections.singletonList(null), requiredElements, rowBufferStatsMap, UTC, 0)); + Collections.singletonList(null), + requiredElements, + rowBufferStatsMap, + mockSubColumnFinder, + UTC, + 0)); } @Test @@ -500,8 +529,8 @@ public void parseMap() throws JsonProcessingException { .key(Types.required(PrimitiveTypeName.INT32).named("key")) .value(Types.optional(PrimitiveTypeName.INT32).named("value")) .named("MAP_COL"); - RowBufferStats rowBufferKeyStats = new RowBufferStats("MAP_COL.key_value.key"); - RowBufferStats rowBufferValueStats = new RowBufferStats("MAP_COL.key_value.value"); + RowBufferStats rowBufferKeyStats = new RowBufferStats("MAP_COL.key_value.key", true, true); + RowBufferStats rowBufferValueStats = new RowBufferStats("MAP_COL.key_value.value", true, 
true); Map rowBufferStatsMap = new HashMap() { { @@ -509,7 +538,8 @@ public void parseMap() throws JsonProcessingException { put("MAP_COL.key_value.value", rowBufferValueStats); } }; - IcebergParquetValueParser.parseColumnValueToParquet(null, map, rowBufferStatsMap, UTC, 0); + IcebergParquetValueParser.parseColumnValueToParquet( + null, map, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetBufferValue pv = IcebergParquetValueParser.parseColumnValueToParquet( new java.util.HashMap() { @@ -520,6 +550,7 @@ public void parseMap() throws JsonProcessingException { }, map, rowBufferStatsMap, + mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() @@ -546,10 +577,15 @@ public void parseMap() throws JsonProcessingException { SFException.class, () -> IcebergParquetValueParser.parseColumnValueToParquet( - null, requiredMap, rowBufferStatsMap, UTC, 0)); + null, requiredMap, rowBufferStatsMap, mockSubColumnFinder, UTC, 0)); pv = IcebergParquetValueParser.parseColumnValueToParquet( - new java.util.HashMap(), requiredMap, rowBufferStatsMap, UTC, 0); + new java.util.HashMap(), + requiredMap, + rowBufferStatsMap, + mockSubColumnFinder, + UTC, + 0); ParquetValueParserAssertionBuilder.newBuilder() .rowBufferStats(rowBufferKeyStats) .parquetBufferValue(pv) @@ -577,6 +613,7 @@ public void parseMap() throws JsonProcessingException { }, requiredValues, rowBufferStatsMap, + mockSubColumnFinder, UTC, 0)); } @@ -592,8 +629,8 @@ public void parseStruct() throws JsonProcessingException { .named("b")) .named("STRUCT_COL"); - RowBufferStats rowBufferAStats = new RowBufferStats("STRUCT_COL.a"); - RowBufferStats rowBufferBStats = new RowBufferStats("STRUCT_COL.b"); + RowBufferStats rowBufferAStats = new RowBufferStats("STRUCT_COL.a", true, true); + RowBufferStats rowBufferBStats = new RowBufferStats("STRUCT_COL.b", true, true); Map rowBufferStatsMap = new HashMap() { { @@ -602,7 +639,8 @@ public void parseStruct() throws JsonProcessingException { } }; - 
IcebergParquetValueParser.parseColumnValueToParquet(null, struct, rowBufferStatsMap, UTC, 0); + IcebergParquetValueParser.parseColumnValueToParquet( + null, struct, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); Assert.assertThrows( SFException.class, () -> @@ -614,6 +652,7 @@ public void parseStruct() throws JsonProcessingException { }, struct, rowBufferStatsMap, + mockSubColumnFinder, UTC, 0)); Assert.assertThrows( @@ -627,6 +666,7 @@ public void parseStruct() throws JsonProcessingException { }, struct, rowBufferStatsMap, + mockSubColumnFinder, UTC, 0)); ParquetBufferValue pv = @@ -640,6 +680,7 @@ public void parseStruct() throws JsonProcessingException { }), struct, rowBufferStatsMap, + mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() @@ -664,10 +705,15 @@ public void parseStruct() throws JsonProcessingException { SFException.class, () -> IcebergParquetValueParser.parseColumnValueToParquet( - null, requiredStruct, rowBufferStatsMap, UTC, 0)); + null, requiredStruct, rowBufferStatsMap, mockSubColumnFinder, UTC, 0)); pv = IcebergParquetValueParser.parseColumnValueToParquet( - new java.util.HashMap(), requiredStruct, rowBufferStatsMap, UTC, 0); + new java.util.HashMap(), + requiredStruct, + rowBufferStatsMap, + mockSubColumnFinder, + UTC, + 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .expectedValueClass(ArrayList.class) @@ -688,7 +734,7 @@ public void parseNestedTypes() { List reference = (List) res.getSecond(); ParquetBufferValue pv = IcebergParquetValueParser.parseColumnValueToParquet( - value, type, rowBufferStatsMap, UTC, 0); + value, type, rowBufferStatsMap, mockSubColumnFinder, UTC, 0); ParquetValueParserAssertionBuilder.newBuilder() .parquetBufferValue(pv) .expectedValueClass(ArrayList.class) @@ -703,7 +749,7 @@ public void parseNestedTypes() { private static Type generateNestedTypeAndStats( int depth, String name, Map rowBufferStatsMap, String path) { if (depth == 0) { - 
rowBufferStatsMap.put(path, new RowBufferStats(path)); + rowBufferStatsMap.put(path, new RowBufferStats(path, true, true)); return Types.optional(PrimitiveTypeName.INT32).named(name); } switch (depth % 3) { @@ -718,7 +764,8 @@ private static Type generateNestedTypeAndStats( .addField(generateNestedTypeAndStats(depth - 1, "a", rowBufferStatsMap, path + ".a")) .named(name); case 0: - rowBufferStatsMap.put(path + ".key_value.key", new RowBufferStats(path + ".key_value.key")); + rowBufferStatsMap.put( + path + ".key_value.key", new RowBufferStats(path + ".key_value.key", true, true)); return Types.optionalMap() .key(Types.required(PrimitiveTypeName.INT32).named("key")) .value( diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferStatsTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferStatsTest.java index 9f2e848f3..a00cb538e 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferStatsTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferStatsTest.java @@ -1,15 +1,28 @@ +/* + * Copyright (c) 2024 Snowflake Computing Inc. All rights reserved. 
+ */ + package net.snowflake.ingest.streaming.internal; import java.math.BigInteger; import java.nio.charset.StandardCharsets; import org.junit.Assert; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +@RunWith(Parameterized.class) public class RowBufferStatsTest { + @Parameterized.Parameters(name = "enableNDVAndNV: {0}") + public static Object[] enableNDVAndNV() { + return new Object[] {false, true}; + } + + @Parameterized.Parameter public boolean enableNDVAndNV; @Test public void testEmptyState() throws Exception { - RowBufferStats stats = new RowBufferStats("COL1"); + RowBufferStats stats = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); Assert.assertNull(stats.getCollationDefinitionString()); Assert.assertNull(stats.getCurrentMinRealValue()); @@ -18,32 +31,36 @@ public void testEmptyState() throws Exception { Assert.assertNull(stats.getCurrentMaxStrValue()); Assert.assertNull(stats.getCurrentMinIntValue()); Assert.assertNull(stats.getCurrentMaxIntValue()); - Assert.assertEquals(0, stats.getCurrentNullCount()); - Assert.assertEquals(-1, stats.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 0 : -1, stats.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 0 : -1, stats.getNumberOfValues()); } @Test public void testMinMaxStrNonCol() throws Exception { - RowBufferStats stats = new RowBufferStats("COL1"); + RowBufferStats stats = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); stats.addStrValue("bob"); Assert.assertArrayEquals("bob".getBytes(StandardCharsets.UTF_8), stats.getCurrentMinStrValue()); Assert.assertArrayEquals("bob".getBytes(StandardCharsets.UTF_8), stats.getCurrentMaxStrValue()); - Assert.assertEquals(-1, stats.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 1 : -1, stats.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 
1 : -1, stats.getNumberOfValues()); stats.addStrValue("charlie"); Assert.assertArrayEquals("bob".getBytes(StandardCharsets.UTF_8), stats.getCurrentMinStrValue()); Assert.assertArrayEquals( "charlie".getBytes(StandardCharsets.UTF_8), stats.getCurrentMaxStrValue()); - Assert.assertEquals(-1, stats.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 2 : -1, stats.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 2 : -1, stats.getNumberOfValues()); stats.addStrValue("alice"); Assert.assertArrayEquals( "alice".getBytes(StandardCharsets.UTF_8), stats.getCurrentMinStrValue()); Assert.assertArrayEquals( "charlie".getBytes(StandardCharsets.UTF_8), stats.getCurrentMaxStrValue()); - Assert.assertEquals(-1, stats.getDistinctValues()); + + Assert.assertEquals(enableNDVAndNV ? 3 : -1, stats.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 3 : -1, stats.getNumberOfValues()); Assert.assertNull(stats.getCurrentMinRealValue()); Assert.assertNull(stats.getCurrentMaxRealValue()); @@ -55,22 +72,28 @@ public void testMinMaxStrNonCol() throws Exception { @Test public void testMinMaxInt() throws Exception { - RowBufferStats stats = new RowBufferStats("COL1"); + RowBufferStats stats = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); stats.addIntValue(BigInteger.valueOf(5)); Assert.assertEquals(BigInteger.valueOf((5)), stats.getCurrentMinIntValue()); Assert.assertEquals(BigInteger.valueOf((5)), stats.getCurrentMaxIntValue()); - Assert.assertEquals(-1, stats.getDistinctValues()); + + Assert.assertEquals(enableNDVAndNV ? 1 : -1, stats.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 1 : -1, stats.getNumberOfValues()); stats.addIntValue(BigInteger.valueOf(6)); Assert.assertEquals(BigInteger.valueOf((5)), stats.getCurrentMinIntValue()); Assert.assertEquals(BigInteger.valueOf((6)), stats.getCurrentMaxIntValue()); - Assert.assertEquals(-1, stats.getDistinctValues()); + + Assert.assertEquals(enableNDVAndNV ? 
2 : -1, stats.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 2 : -1, stats.getNumberOfValues()); stats.addIntValue(BigInteger.valueOf(4)); Assert.assertEquals(BigInteger.valueOf((4)), stats.getCurrentMinIntValue()); Assert.assertEquals(BigInteger.valueOf((6)), stats.getCurrentMaxIntValue()); - Assert.assertEquals(-1, stats.getDistinctValues()); + + Assert.assertEquals(enableNDVAndNV ? 3 : -1, stats.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 3 : -1, stats.getNumberOfValues()); Assert.assertNull(stats.getCurrentMinRealValue()); Assert.assertNull(stats.getCurrentMaxRealValue()); @@ -82,22 +105,28 @@ public void testMinMaxInt() throws Exception { @Test public void testMinMaxReal() throws Exception { - RowBufferStats stats = new RowBufferStats("COL1"); + RowBufferStats stats = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); stats.addRealValue(1.0); Assert.assertEquals(Double.valueOf(1), stats.getCurrentMinRealValue()); Assert.assertEquals(Double.valueOf(1), stats.getCurrentMaxRealValue()); - Assert.assertEquals(-1, stats.getDistinctValues()); + + Assert.assertEquals(enableNDVAndNV ? 1 : -1, stats.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 1 : -1, stats.getNumberOfValues()); stats.addRealValue(1.5); Assert.assertEquals(Double.valueOf(1), stats.getCurrentMinRealValue()); Assert.assertEquals(Double.valueOf(1.5), stats.getCurrentMaxRealValue()); - Assert.assertEquals(-1, stats.getDistinctValues()); + + Assert.assertEquals(enableNDVAndNV ? 2 : -1, stats.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 2 : -1, stats.getNumberOfValues()); stats.addRealValue(.8); Assert.assertEquals(Double.valueOf(.8), stats.getCurrentMinRealValue()); Assert.assertEquals(Double.valueOf(1.5), stats.getCurrentMaxRealValue()); - Assert.assertEquals(-1, stats.getDistinctValues()); + + Assert.assertEquals(enableNDVAndNV ? 3 : -1, stats.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 
3 : -1, stats.getNumberOfValues()); Assert.assertNull(stats.getCurrentMinIntValue()); Assert.assertNull(stats.getCurrentMaxIntValue()); @@ -109,7 +138,7 @@ public void testMinMaxReal() throws Exception { @Test public void testIncCurrentNullCount() throws Exception { - RowBufferStats stats = new RowBufferStats("COL1"); + RowBufferStats stats = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); Assert.assertEquals(0, stats.getCurrentNullCount()); stats.incCurrentNullCount(); @@ -120,7 +149,7 @@ public void testIncCurrentNullCount() throws Exception { @Test public void testMaxLength() throws Exception { - RowBufferStats stats = new RowBufferStats("COL1"); + RowBufferStats stats = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); Assert.assertEquals(0, stats.getCurrentMaxLength()); stats.setCurrentMaxLength(100L); @@ -132,8 +161,8 @@ public void testMaxLength() throws Exception { @Test public void testGetCombinedStats() throws Exception { // Test for Integers - RowBufferStats one = new RowBufferStats("COL1"); - RowBufferStats two = new RowBufferStats("COL1"); + RowBufferStats one = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); + RowBufferStats two = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); one.addIntValue(BigInteger.valueOf(2)); one.addIntValue(BigInteger.valueOf(4)); @@ -150,17 +179,18 @@ public void testGetCombinedStats() throws Exception { RowBufferStats result = RowBufferStats.getCombinedStats(one, two); Assert.assertEquals(BigInteger.valueOf(1), result.getCurrentMinIntValue()); Assert.assertEquals(BigInteger.valueOf(8), result.getCurrentMaxIntValue()); - Assert.assertEquals(-1, result.getDistinctValues()); - Assert.assertEquals(2, result.getCurrentNullCount()); + Assert.assertEquals(enableNDVAndNV ? 7 : -1, result.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 
8 : -1, result.getNumberOfValues()); + Assert.assertEquals(2, result.getCurrentNullCount()); Assert.assertNull(result.getCurrentMinStrValue()); Assert.assertNull(result.getCurrentMaxStrValue()); Assert.assertNull(result.getCurrentMinRealValue()); Assert.assertNull(result.getCurrentMaxRealValue()); // Test for Reals - one = new RowBufferStats("COL1"); - two = new RowBufferStats("COL1"); + one = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); + two = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); one.addRealValue(2d); one.addRealValue(4d); @@ -175,9 +205,10 @@ public void testGetCombinedStats() throws Exception { result = RowBufferStats.getCombinedStats(one, two); Assert.assertEquals(Double.valueOf(1), result.getCurrentMinRealValue()); Assert.assertEquals(Double.valueOf(8), result.getCurrentMaxRealValue()); - Assert.assertEquals(-1, result.getDistinctValues()); - Assert.assertEquals(0, result.getCurrentNullCount()); + Assert.assertEquals(enableNDVAndNV ? 7 : -1, result.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 
8 : -1, result.getNumberOfValues()); + Assert.assertEquals(0, result.getCurrentNullCount()); Assert.assertNull(result.getCollationDefinitionString()); Assert.assertNull(result.getCurrentMinStrValue()); Assert.assertNull(result.getCurrentMaxStrValue()); @@ -185,8 +216,8 @@ public void testGetCombinedStats() throws Exception { Assert.assertNull(result.getCurrentMaxIntValue()); // Test for Strings without collation - one = new RowBufferStats("COL1"); - two = new RowBufferStats("COL1"); + one = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); + two = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); one.addStrValue("alpha"); one.addStrValue("d"); @@ -205,9 +236,10 @@ public void testGetCombinedStats() throws Exception { result = RowBufferStats.getCombinedStats(one, two); Assert.assertArrayEquals("a".getBytes(StandardCharsets.UTF_8), result.getCurrentMinStrValue()); Assert.assertArrayEquals("g".getBytes(StandardCharsets.UTF_8), result.getCurrentMaxStrValue()); - Assert.assertEquals(-1, result.getDistinctValues()); Assert.assertEquals(2, result.getCurrentNullCount()); Assert.assertEquals(5, result.getCurrentMaxLength()); + Assert.assertEquals(enableNDVAndNV ? 7 : -1, result.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 
8 : -1, result.getNumberOfValues()); Assert.assertNull(result.getCurrentMinRealValue()); Assert.assertNull(result.getCurrentMaxRealValue()); @@ -218,8 +250,8 @@ public void testGetCombinedStats() throws Exception { @Test public void testGetCombinedStatsNull() throws Exception { // Test for Integers - RowBufferStats one = new RowBufferStats("COL1"); - RowBufferStats two = new RowBufferStats("COL1"); + RowBufferStats one = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); + RowBufferStats two = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); one.addIntValue(BigInteger.valueOf(2)); one.addIntValue(BigInteger.valueOf(4)); @@ -231,7 +263,9 @@ public void testGetCombinedStatsNull() throws Exception { RowBufferStats result = RowBufferStats.getCombinedStats(one, two); Assert.assertEquals(BigInteger.valueOf(2), result.getCurrentMinIntValue()); Assert.assertEquals(BigInteger.valueOf(8), result.getCurrentMaxIntValue()); - Assert.assertEquals(-1, result.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 4 : -1, result.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 4 : -1, result.getNumberOfValues()); + Assert.assertEquals(2, result.getCurrentNullCount()); Assert.assertNull(result.getCurrentMinStrValue()); @@ -240,7 +274,7 @@ public void testGetCombinedStatsNull() throws Exception { Assert.assertNull(result.getCurrentMaxRealValue()); // Test for Reals - one = new RowBufferStats("COL1"); + one = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); one.addRealValue(2d); one.addRealValue(4d); @@ -250,7 +284,8 @@ public void testGetCombinedStatsNull() throws Exception { result = RowBufferStats.getCombinedStats(one, two); Assert.assertEquals(Double.valueOf(2), result.getCurrentMinRealValue()); Assert.assertEquals(Double.valueOf(8), result.getCurrentMaxRealValue()); - Assert.assertEquals(-1, result.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 
4 : -1, result.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 4 : -1, result.getNumberOfValues()); Assert.assertEquals(0, result.getCurrentNullCount()); Assert.assertNull(result.getCurrentMinStrValue()); @@ -259,8 +294,8 @@ public void testGetCombinedStatsNull() throws Exception { Assert.assertNull(result.getCurrentMaxIntValue()); // Test for Strings - one = new RowBufferStats("COL1"); - two = new RowBufferStats("COL1"); + one = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); + two = new RowBufferStats("COL1", enableNDVAndNV, enableNDVAndNV); one.addStrValue("alpha"); one.addStrValue("d"); @@ -272,7 +307,9 @@ public void testGetCombinedStatsNull() throws Exception { Assert.assertArrayEquals( "alpha".getBytes(StandardCharsets.UTF_8), result.getCurrentMinStrValue()); Assert.assertArrayEquals("g".getBytes(StandardCharsets.UTF_8), result.getCurrentMaxStrValue()); - Assert.assertEquals(-1, result.getDistinctValues()); + + Assert.assertEquals(enableNDVAndNV ? 4 : -1, result.getDistinctValues()); + Assert.assertEquals(enableNDVAndNV ? 4 : -1, result.getNumberOfValues()); Assert.assertEquals(1, result.getCurrentNullCount()); Assert.assertNull(result.getCurrentMinRealValue()); diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java index e1cb764dd..246753fbe 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java @@ -1,9 +1,15 @@ +/* + * Copyright (c) 2024 Snowflake Computing Inc. All rights reserved. 
+ */ + package net.snowflake.ingest.streaming.internal; import static java.time.ZoneOffset.UTC; +import static net.snowflake.ingest.utils.Constants.EP_NV_UNKNOWN; import static net.snowflake.ingest.utils.ParameterProvider.ENABLE_NEW_JSON_PARSING_LOGIC_DEFAULT; import static net.snowflake.ingest.utils.ParameterProvider.MAX_ALLOWED_ROW_SIZE_IN_BYTES_DEFAULT; import static net.snowflake.ingest.utils.ParameterProvider.MAX_CHUNK_SIZE_IN_BYTES_DEFAULT; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; import static org.junit.Assert.fail; import java.io.IOException; @@ -139,6 +145,67 @@ static List createSchema() { return columns; } + static List createStrcuturedDataTypeSchema() { + ColumnMetadata colObject = new ColumnMetadata(); + colObject.setName("COLOBJECT"); + colObject.setPhysicalType("LOB"); + colObject.setNullable(true); + colObject.setLogicalType("OBJECT"); + colObject.setSourceIcebergDataType( + "{\n" + + " \"type\": \"struct\",\n" + + " \"fields\":\n" + + " [\n" + + " {\n" + + " \"id\": 4,\n" + + " \"name\": \"a\",\n" + + " \"required\": false,\n" + + " \"type\": \"int\"\n" + + " },\n" + + " {\n" + + " \"id\": 5,\n" + + " \"name\": \"b\",\n" + + " \"required\": false,\n" + + " \"type\": \"string\"\n" + + " }\n" + + " ]\n" + + "}"); + + ColumnMetadata colMap = new ColumnMetadata(); + colMap.setName("COLMAP"); + colMap.setPhysicalType("LOB"); + colMap.setNullable(true); + colMap.setLogicalType("MAP"); + colMap.setSourceIcebergDataType( + "{\n" + + " \"type\": \"map\",\n" + + " \"key-id\": 6,\n" + + " \"key\": \"string\",\n" + + " \"value-id\": 7,\n" + + " \"value\": \"boolean\",\n" + + " \"value-required\": false\n" + + "}"); + + ColumnMetadata colArray = new ColumnMetadata(); + colArray.setName("COLARRAY"); + colArray.setPhysicalType("LOB"); + colArray.setNullable(true); + colArray.setLogicalType("ARRAY"); + colArray.setSourceIcebergDataType( + "{\n" + + " \"type\": \"list\",\n" + + " \"element-id\": 8,\n" + + " \"element\": \"long\",\n" + 
+ " \"element-required\": false\n" + + "}"); + + List columns = Arrays.asList(colObject, colMap, colArray); + for (int i = 0; i < columns.size(); i++) { + columns.get(i).setOrdinal(i + 1); + } + return columns; + } + private AbstractRowBuffer createTestBuffer(OpenChannelRequest.OnErrorOption onErrorOption) { ChannelRuntimeState initialState = new ChannelRuntimeState("0", 0L, true); return AbstractRowBuffer.createRowBuffer( @@ -154,6 +221,8 @@ private AbstractRowBuffer createTestBuffer(OpenChannelRequest.OnErrorOption o Constants.BdecParquetCompression.GZIP, ENABLE_NEW_JSON_PARSING_LOGIC_DEFAULT, isIcebergMode ? Optional.of(1) : Optional.empty(), + isIcebergMode, + isIcebergMode, isIcebergMode), null, isIcebergMode @@ -572,7 +641,9 @@ public void testBuildEpInfoFromStats() { RowBufferStats stats1 = new RowBufferStats( "intColumn", - Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named("intColumn")); + Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named("intColumn"), + isIcebergMode, + isIcebergMode); stats1.addIntValue(BigInteger.valueOf(2)); stats1.addIntValue(BigInteger.valueOf(10)); stats1.addIntValue(BigInteger.valueOf(1)); @@ -580,7 +651,9 @@ public void testBuildEpInfoFromStats() { RowBufferStats stats2 = new RowBufferStats( "strColumn", - Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).id(2).named("strColumn")); + Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).id(2).named("strColumn"), + isIcebergMode, + isIcebergMode); stats2.addStrValue("alice"); stats2.addStrValue("bob"); stats2.incCurrentNullCount(); @@ -588,12 +661,13 @@ public void testBuildEpInfoFromStats() { colStats.put("intColumn", stats1); colStats.put("strColumn", stats2); - EpInfo result = AbstractRowBuffer.buildEpInfoFromStats(2, colStats, !isIcebergMode); + EpInfo result = + AbstractRowBuffer.buildEpInfoFromStats(2, colStats, !isIcebergMode, isIcebergMode); Map columnResults = result.getColumnEps(); Assert.assertEquals(2, columnResults.keySet().size()); 
FileColumnProperties strColumnResult = columnResults.get("strColumn"); - Assert.assertEquals(-1, strColumnResult.getDistinctValues()); + Assert.assertEquals(isIcebergMode ? 2 : -1, strColumnResult.getDistinctValues()); Assert.assertEquals( Hex.encodeHexString("alice".getBytes(StandardCharsets.UTF_8)), strColumnResult.getMinStrValue()); @@ -603,7 +677,7 @@ public void testBuildEpInfoFromStats() { Assert.assertEquals(1, strColumnResult.getNullCount()); FileColumnProperties intColumnResult = columnResults.get("intColumn"); - Assert.assertEquals(-1, intColumnResult.getDistinctValues()); + Assert.assertEquals(isIcebergMode ? 3 : -1, intColumnResult.getDistinctValues()); Assert.assertEquals(BigInteger.valueOf(1), intColumnResult.getMinIntValue()); Assert.assertEquals(BigInteger.valueOf(10), intColumnResult.getMaxIntValue()); Assert.assertEquals(0, intColumnResult.getNullCount()); @@ -618,23 +692,28 @@ public void testBuildEpInfoFromNullColumnStats() { RowBufferStats stats1 = new RowBufferStats( intColName, - Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named(intColName)); + Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named(intColName), + isIcebergMode, + isIcebergMode); RowBufferStats stats2 = new RowBufferStats( realColName, - Types.optional(PrimitiveType.PrimitiveTypeName.DOUBLE).id(2).named(realColName)); + Types.optional(PrimitiveType.PrimitiveTypeName.DOUBLE).id(2).named(realColName), + isIcebergMode, + isIcebergMode); stats1.incCurrentNullCount(); stats2.incCurrentNullCount(); colStats.put(intColName, stats1); colStats.put(realColName, stats2); - EpInfo result = AbstractRowBuffer.buildEpInfoFromStats(2, colStats, !isIcebergMode); + EpInfo result = + AbstractRowBuffer.buildEpInfoFromStats(2, colStats, !isIcebergMode, isIcebergMode); Map columnResults = result.getColumnEps(); Assert.assertEquals(2, columnResults.keySet().size()); FileColumnProperties intColumnResult = columnResults.get(intColName); - Assert.assertEquals(-1, 
intColumnResult.getDistinctValues()); + Assert.assertEquals(isIcebergMode ? 0 : -1, intColumnResult.getDistinctValues()); Assert.assertEquals( FileColumnProperties.DEFAULT_MIN_MAX_INT_VAL_FOR_EP, intColumnResult.getMinIntValue()); Assert.assertEquals( @@ -643,7 +722,7 @@ public void testBuildEpInfoFromNullColumnStats() { Assert.assertEquals(0, intColumnResult.getMaxLength()); FileColumnProperties realColumnResult = columnResults.get(realColName); - Assert.assertEquals(-1, intColumnResult.getDistinctValues()); + Assert.assertEquals(isIcebergMode ? 0 : -1, realColumnResult.getDistinctValues()); Assert.assertEquals( FileColumnProperties.DEFAULT_MIN_MAX_REAL_VAL_FOR_EP, realColumnResult.getMinRealValue()); Assert.assertEquals( @@ -659,7 +738,9 @@ public void testInvalidEPInfo() { RowBufferStats stats1 = new RowBufferStats( "intColumn", - Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named("intColumn")); + Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named("intColumn"), + isIcebergMode, + isIcebergMode); stats1.addIntValue(BigInteger.valueOf(2)); stats1.addIntValue(BigInteger.valueOf(10)); stats1.addIntValue(BigInteger.valueOf(1)); @@ -667,7 +748,9 @@ public void testInvalidEPInfo() { RowBufferStats stats2 = new RowBufferStats( "strColumn", - Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).id(2).named("strColumn")); + Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).id(2).named("strColumn"), + isIcebergMode, + isIcebergMode); stats2.addStrValue("alice"); stats2.incCurrentNullCount(); stats2.incCurrentNullCount(); @@ -676,7 +759,7 @@ public void testInvalidEPInfo() { colStats.put("strColumn", stats2); try { - AbstractRowBuffer.buildEpInfoFromStats(1, colStats, !isIcebergMode); + AbstractRowBuffer.buildEpInfoFromStats(1, colStats, !isIcebergMode, isIcebergMode); fail("should fail when row count is smaller than null count."); } catch (SFException e) { Assert.assertEquals(ErrorCode.INTERNAL_ERROR.getMessageCode(), e.getVendorCode()); 
@@ -789,33 +872,36 @@ private void testStatsE2EHelper(AbstractRowBuffer rowBuffer) { Assert.assertEquals( BigInteger.valueOf(10), columnEpStats.get("colTinyInt").getCurrentMinIntValue()); Assert.assertEquals(0, columnEpStats.get("colTinyInt").getCurrentNullCount()); - Assert.assertEquals(-1, columnEpStats.get("colTinyInt").getDistinctValues()); + Assert.assertEquals( + isIcebergMode ? 2 : -1, columnEpStats.get("colTinyInt").getDistinctValues()); Assert.assertEquals( BigInteger.valueOf(1), columnEpStats.get("COLTINYINT").getCurrentMaxIntValue()); Assert.assertEquals( BigInteger.valueOf(1), columnEpStats.get("COLTINYINT").getCurrentMinIntValue()); Assert.assertEquals(0, columnEpStats.get("COLTINYINT").getCurrentNullCount()); - Assert.assertEquals(-1, columnEpStats.get("COLTINYINT").getDistinctValues()); + Assert.assertEquals( + isIcebergMode ? 1 : -1, columnEpStats.get("COLTINYINT").getDistinctValues()); Assert.assertEquals( BigInteger.valueOf(3), columnEpStats.get("COLSMALLINT").getCurrentMaxIntValue()); Assert.assertEquals( BigInteger.valueOf(2), columnEpStats.get("COLSMALLINT").getCurrentMinIntValue()); Assert.assertEquals(0, columnEpStats.get("COLSMALLINT").getCurrentNullCount()); - Assert.assertEquals(-1, columnEpStats.get("COLSMALLINT").getDistinctValues()); + Assert.assertEquals( + isIcebergMode ? 2 : -1, columnEpStats.get("COLSMALLINT").getDistinctValues()); Assert.assertEquals(BigInteger.valueOf(3), columnEpStats.get("COLINT").getCurrentMaxIntValue()); Assert.assertEquals(BigInteger.valueOf(3), columnEpStats.get("COLINT").getCurrentMinIntValue()); Assert.assertEquals(1L, columnEpStats.get("COLINT").getCurrentNullCount()); - Assert.assertEquals(-1, columnEpStats.get("COLINT").getDistinctValues()); + Assert.assertEquals(isIcebergMode ? 
1 : -1, columnEpStats.get("COLINT").getDistinctValues()); Assert.assertEquals( BigInteger.valueOf(40), columnEpStats.get("COLBIGINT").getCurrentMaxIntValue()); Assert.assertEquals( BigInteger.valueOf(4), columnEpStats.get("COLBIGINT").getCurrentMinIntValue()); Assert.assertEquals(0, columnEpStats.get("COLBIGINT").getCurrentNullCount()); - Assert.assertEquals(-1, columnEpStats.get("COLBIGINT").getDistinctValues()); + Assert.assertEquals(isIcebergMode ? 2 : -1, columnEpStats.get("COLBIGINT").getDistinctValues()); Assert.assertArrayEquals( "2".getBytes(StandardCharsets.UTF_8), columnEpStats.get("COLCHAR").getCurrentMinStrValue()); @@ -823,7 +909,7 @@ private void testStatsE2EHelper(AbstractRowBuffer rowBuffer) { "alice".getBytes(StandardCharsets.UTF_8), columnEpStats.get("COLCHAR").getCurrentMaxStrValue()); Assert.assertEquals(0, columnEpStats.get("COLCHAR").getCurrentNullCount()); - Assert.assertEquals(-1, columnEpStats.get("COLCHAR").getDistinctValues()); + Assert.assertEquals(isIcebergMode ? 
2 : -1, columnEpStats.get("COLCHAR").getDistinctValues()); // Confirm we reset ChannelData resetResults = rowBuffer.flush(); @@ -1102,7 +1188,9 @@ private void testMaxInsertRowsBatchSizeHelper(OpenChannelRequest.OnErrorOption o colBinary.setLogicalType("BINARY"); colBinary.setLength(8 * 1024 * 1024); colBinary.setByteLength(8 * 1024 * 1024); - colBinary.setSourceIcebergDataType("\"binary\""); + if (isIcebergMode) { + colBinary.setSourceIcebergDataType("\"binary\""); + } byte[] arr = new byte[8 * 1024 * 1024]; innerBuffer.setupSchema(Collections.singletonList(colBinary)); @@ -1335,6 +1423,9 @@ private void testE2EBooleanHelper(OpenChannelRequest.OnErrorOption onErrorOption colBoolean.setNullable(true); colBoolean.setLogicalType("BOOLEAN"); colBoolean.setScale(0); + if (isIcebergMode) { + colBoolean.setSourceIcebergDataType("\"boolean\""); + } innerBuffer.setupSchema(Collections.singletonList(colBoolean)); @@ -1938,6 +2029,93 @@ public void testParquetFileNameMetadata() throws IOException { Assert.assertEquals(filePath, reader.getKeyValueMetadata().get(Constants.PRIMARY_FILE_ID_KEY)); } + @Test + public void testStructuredStatsE2E() { + if (!isIcebergMode) return; + testStructuredStatsE2EHelper(createTestBuffer(OpenChannelRequest.OnErrorOption.CONTINUE)); + testStructuredStatsE2EHelper(createTestBuffer(OpenChannelRequest.OnErrorOption.ABORT)); + testStructuredStatsE2EHelper(createTestBuffer(OpenChannelRequest.OnErrorOption.SKIP_BATCH)); + } + + private void testStructuredStatsE2EHelper(AbstractRowBuffer rowBuffer) { + rowBuffer.setupSchema(createStrcuturedDataTypeSchema()); + Map row1 = new HashMap<>(); + row1.put( + "COLOBJECT", + new HashMap() { + { + put("a", 1); + put("b", "string1"); + } + }); + row1.put( + "COLMAP", + new HashMap() { + { + put("key1", true); + put("key2", true); + } + }); + row1.put("COLARRAY", Arrays.asList(1, 1, 1)); + + Map row2 = new HashMap<>(); + row2.put( + "COLOBJECT", + new HashMap() { + { + put("a", 2); + put("b", null); + } + }); + 
row2.put("COLMAP", null); + row2.put("COLARRAY", Arrays.asList(1, null)); + + InsertValidationResponse response = rowBuffer.insertRows(Arrays.asList(row1, row2), null, null); + Assert.assertFalse(response.hasErrors()); + ChannelData result = rowBuffer.flush(); + Map columnEpStats = result.getColumnEps(); + + assertThat(columnEpStats.get("COLOBJECT.a").getCurrentMinIntValue()) + .isEqualTo(BigInteger.valueOf(1)); + assertThat(columnEpStats.get("COLOBJECT.a").getCurrentMaxIntValue()) + .isEqualTo(BigInteger.valueOf(2)); + assertThat(columnEpStats.get("COLOBJECT.a").getCurrentNullCount()).isEqualTo(0); + assertThat(columnEpStats.get("COLOBJECT.a").getDistinctValues()).isEqualTo(2); + assertThat(columnEpStats.get("COLOBJECT.a").getNumberOfValues()).isEqualTo(EP_NV_UNKNOWN); + + assertThat(columnEpStats.get("COLOBJECT.b").getCurrentMinStrValue()) + .isEqualTo("string1".getBytes(StandardCharsets.UTF_8)); + assertThat(columnEpStats.get("COLOBJECT.b").getCurrentMaxStrValue()) + .isEqualTo("string1".getBytes(StandardCharsets.UTF_8)); + assertThat(columnEpStats.get("COLOBJECT.b").getCurrentNullCount()).isEqualTo(1); + assertThat(columnEpStats.get("COLOBJECT.b").getDistinctValues()).isEqualTo(1); + assertThat(columnEpStats.get("COLOBJECT.b").getNumberOfValues()).isEqualTo(EP_NV_UNKNOWN); + + assertThat(columnEpStats.get("COLMAP.key_value.key").getCurrentMinStrValue()) + .isEqualTo("key1".getBytes(StandardCharsets.UTF_8)); + assertThat(columnEpStats.get("COLMAP.key_value.key").getCurrentMaxStrValue()) + .isEqualTo("key2".getBytes(StandardCharsets.UTF_8)); + assertThat(columnEpStats.get("COLMAP.key_value.key").getCurrentNullCount()).isEqualTo(1); + assertThat(columnEpStats.get("COLMAP.key_value.key").getDistinctValues()).isEqualTo(2); + assertThat(columnEpStats.get("COLMAP.key_value.key").getNumberOfValues()).isEqualTo(2); + + assertThat(columnEpStats.get("COLMAP.key_value.value").getCurrentMinIntValue()) + .isEqualTo(BigInteger.ONE); + 
assertThat(columnEpStats.get("COLMAP.key_value.value").getCurrentMaxIntValue()) + .isEqualTo(BigInteger.ONE); + assertThat(columnEpStats.get("COLMAP.key_value.value").getCurrentNullCount()).isEqualTo(1); + assertThat(columnEpStats.get("COLMAP.key_value.value").getDistinctValues()).isEqualTo(1); + assertThat(columnEpStats.get("COLMAP.key_value.value").getNumberOfValues()).isEqualTo(2); + + assertThat(columnEpStats.get("COLARRAY.list.element").getCurrentMinIntValue()) + .isEqualTo(BigInteger.ONE); + assertThat(columnEpStats.get("COLARRAY.list.element").getCurrentMaxIntValue()) + .isEqualTo(BigInteger.ONE); + assertThat(columnEpStats.get("COLARRAY.list.element").getCurrentNullCount()).isEqualTo(1); + assertThat(columnEpStats.get("COLARRAY.list.element").getDistinctValues()).isEqualTo(1); + assertThat(columnEpStats.get("COLARRAY.list.element").getNumberOfValues()).isEqualTo(4); + } + private static Thread getThreadThatWaitsForLockReleaseAndFlushes( final ParquetRowBuffer bufferUnderTest, final CountDownLatch latch, diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeParquetValueParserTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeParquetValueParserTest.java index 64db57c31..a90e26e9d 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeParquetValueParserTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeParquetValueParserTest.java @@ -1,3 +1,7 @@ +/* + * Copyright (c) 2024 Snowflake Computing Inc. All rights reserved. 
+ */ + package net.snowflake.ingest.streaming.internal; import static java.time.ZoneOffset.UTC; @@ -29,7 +33,7 @@ public void parseValueFixedSB1ToInt32() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( 12, @@ -61,7 +65,7 @@ public void parseValueFixedSB2ToInt32() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( 1234, @@ -93,7 +97,7 @@ public void parseValueFixedSB4ToInt32() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( 123456789, @@ -125,7 +129,7 @@ public void parseValueFixedSB8ToInt64() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( 123456789987654321L, @@ -157,7 +161,7 @@ public void parseValueFixedSB16ToByteArray() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( new BigDecimal("91234567899876543219876543211234567891"), @@ -191,7 +195,7 @@ public void parseValueFixedDecimalToInt32() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = 
SnowflakeParquetValueParser.parseColumnValueToParquet( new BigDecimal("12345.54321"), @@ -221,7 +225,7 @@ public void parseValueDouble() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( 12345.54321d, @@ -251,7 +255,7 @@ public void parseValueBoolean() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( true, @@ -281,7 +285,7 @@ public void parseValueBinary() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( "1234abcd".getBytes(), @@ -326,7 +330,7 @@ private void testJsonWithLogicalType(String logicalType, boolean enableNewJsonPa String var = "{\"key1\":-879869596,\"key2\":\"value2\",\"key3\":null," + "\"key4\":{\"key41\":0.032437,\"key42\":\"value42\",\"key43\":null}}"; - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( var, @@ -376,7 +380,7 @@ private void testNullJsonWithLogicalType(String var, boolean enableNewJsonParsin .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( var, @@ -417,7 +421,7 @@ public void parseValueArrayToBinaryInternal(boolean enableNewJsonParsingLogic) { input.put("b", "2"); input.put("c", "3"); - RowBufferStats 
rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( input, @@ -455,7 +459,7 @@ public void parseValueTextToBinary() { String text = "This is a sample text! Length is bigger than 32 bytes :)"; - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( text, @@ -492,7 +496,7 @@ public void parseValueTimestampNtzSB4Error() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); SFException exception = Assert.assertThrows( SFException.class, @@ -520,7 +524,7 @@ public void parseValueTimestampNtzSB8ToINT64() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( "2013-04-28T20:57:01.000", @@ -551,7 +555,7 @@ public void parseValueTimestampNtzSB16ToByteArray() { .scale(9) // nanos .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( "2022-09-18T22:05:07.123456789", @@ -583,7 +587,7 @@ public void parseValueDateToInt32() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( "2021-01-01", @@ -614,7 +618,7 @@ public void parseValueTimeSB4ToInt32() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new 
RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( "01:00:00", @@ -645,7 +649,7 @@ public void parseValueTimeSB8ToInt64() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); ParquetBufferValue pv = SnowflakeParquetValueParser.parseColumnValueToParquet( "01:00:00.123", @@ -676,7 +680,7 @@ public void parseValueTimeSB16Error() { .nullable(true) .build(); - RowBufferStats rowBufferStats = new RowBufferStats("COL1"); + RowBufferStats rowBufferStats = new RowBufferStats("COL1", false, false); SFException exception = Assert.assertThrows( SFException.class, diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestClientTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestClientTest.java index 627593e82..e3f563fb6 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestClientTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestClientTest.java @@ -506,8 +506,12 @@ public void testRegisterBlobRequestCreationSuccess() throws Exception { columnEps.put( "column", new RowBufferStats( - "COL1", Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named("COL1"))); - EpInfo epInfo = AbstractRowBuffer.buildEpInfoFromStats(1, columnEps, !isIcebergMode); + "COL1", + Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named("COL1"), + isIcebergMode, + isIcebergMode)); + EpInfo epInfo = + AbstractRowBuffer.buildEpInfoFromStats(1, columnEps, !isIcebergMode, isIcebergMode); ChunkMetadata chunkMetadata = ChunkMetadata.builder() @@ -558,8 +562,12 @@ private Pair, Set> getRetryBlobMetadata( columnEps.put( "column", new RowBufferStats( - "COL1", 
Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named("COL1"))); - EpInfo epInfo = AbstractRowBuffer.buildEpInfoFromStats(1, columnEps, !isIcebergMode); + "COL1", + Types.optional(PrimitiveType.PrimitiveTypeName.INT32).id(1).named("COL1"), + isIcebergMode, + isIcebergMode)); + EpInfo epInfo = + AbstractRowBuffer.buildEpInfoFromStats(1, columnEps, !isIcebergMode, isIcebergMode); ChannelMetadata channelMetadata1 = ChannelMetadata.builder() diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/AbstractDataTypeTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/AbstractDataTypeTest.java index 545a4827b..baa370e16 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/AbstractDataTypeTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/AbstractDataTypeTest.java @@ -1,3 +1,7 @@ +/* + * Copyright (c) 2024 Snowflake Computing Inc. All rights reserved. + */ + package net.snowflake.ingest.streaming.internal.datatypes; import static net.snowflake.ingest.utils.Constants.ROLE; @@ -96,20 +100,23 @@ protected void setUp( conn.createStatement().execute(String.format("use database %s;", databaseName)); conn.createStatement().execute(String.format("use schema %s;", schemaName)); - switch (serializationPolicy) { - case COMPATIBLE: - conn.createStatement() - .execute( - String.format( - "alter schema %s set STORAGE_SERIALIZATION_POLICY = 'COMPATIBLE';", - schemaName)); - break; - case OPTIMIZED: - conn.createStatement() - .execute( - String.format( - "alter schema %s set STORAGE_SERIALIZATION_POLICY = 'OPTIMIZED';", schemaName)); - break; + if (isIceberg) { + switch (serializationPolicy) { + case COMPATIBLE: + conn.createStatement() + .execute( + String.format( + "alter schema %s set STORAGE_SERIALIZATION_POLICY = 'COMPATIBLE';", + schemaName)); + break; + case OPTIMIZED: + conn.createStatement() + .execute( + String.format( + "alter schema %s set STORAGE_SERIALIZATION_POLICY = 
'OPTIMIZED';", + schemaName)); + break; + } } conn.createStatement().execute(String.format("use warehouse %s;", TestUtils.getWarehouse())); @@ -163,7 +170,7 @@ protected String createTable(String dataType) throws SQLException { protected String createIcebergTable(String dataType) throws SQLException { String tableName = getRandomIdentifier(); String baseLocation = - String.format("%s/%s/%s", databaseName, dataType.replace(" ", "_"), tableName); + String.format("SDK_IT/%s/%s/%s", databaseName, dataType.replace(" ", "_"), tableName); conn.createStatement() .execute( String.format( diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/IcebergNumericTypesIT.java b/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/IcebergNumericTypesIT.java index e4b0783d4..63dd71ddf 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/IcebergNumericTypesIT.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/IcebergNumericTypesIT.java @@ -1,3 +1,7 @@ +/* + * Copyright (c) 2024 Snowflake Computing Inc. All rights reserved. + */ + package net.snowflake.ingest.streaming.internal.datatypes; import java.math.BigDecimal; diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/IcebergStructuredIT.java b/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/IcebergStructuredIT.java index f99e82d30..ddf473dc5 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/IcebergStructuredIT.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/IcebergStructuredIT.java @@ -1,3 +1,7 @@ +/* + * Copyright (c) 2024 Snowflake Computing Inc. All rights reserved. 
+ */ + package net.snowflake.ingest.streaming.internal.datatypes; import com.fasterxml.jackson.databind.JsonNode; @@ -77,14 +81,20 @@ public void testStructuredDataType() throws Exception { .extracting(SFException::getVendorCode) .isEqualTo(ErrorCode.INVALID_FORMAT_ROW.getMessageCode()); - /* Null struct, map list. TODO: SNOW-1727532 Should be fixed with null values EP calculation. */ + /* Null struct, map list. */ Assertions.assertThatThrownBy( () -> assertStructuredDataType("object(a int, b string, c boolean) not null", null)) - .isInstanceOf(NullPointerException.class); + .isInstanceOf(SFException.class) + .extracting("vendorCode") + .isEqualTo(ErrorCode.INVALID_FORMAT_ROW.getMessageCode()); Assertions.assertThatThrownBy(() -> assertStructuredDataType("map(string, int) not null", null)) - .isInstanceOf(NullPointerException.class); + .isInstanceOf(SFException.class) + .extracting("vendorCode") + .isEqualTo(ErrorCode.INVALID_FORMAT_ROW.getMessageCode()); Assertions.assertThatThrownBy(() -> assertStructuredDataType("array(int) not null", null)) - .isInstanceOf(NullPointerException.class); + .isInstanceOf(SFException.class) + .extracting("vendorCode") + .isEqualTo(ErrorCode.INVALID_FORMAT_ROW.getMessageCode()); /* Nested data types. Should be fixed. Fixed in server side. */ Assertions.assertThatThrownBy( diff --git a/src/test/java/net/snowflake/ingest/utils/SubColumnFinderTest.java b/src/test/java/net/snowflake/ingest/utils/SubColumnFinderTest.java new file mode 100644 index 000000000..b5c538d00 --- /dev/null +++ b/src/test/java/net/snowflake/ingest/utils/SubColumnFinderTest.java @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2024 Snowflake Computing Inc. All rights reserved. 
+ */
+
+package net.snowflake.ingest.utils;
+
+import static net.snowflake.ingest.utils.Utils.concatDotPath;
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.MessageTypeParser;
+import org.junit.Test;
+
+/**
+ * Tests {@code SubColumnFinder.getSubColumns} by comparing its answer, for every dot path that
+ * can be derived from a schema, with a brute-force prefix scan over the schema's columns.
+ */
+public class SubColumnFinderTest {
+
+  /** Schema made only of top-level primitive columns. */
+  @Test
+  public void testFlatSchema() {
+    MessageType schema =
+        MessageTypeParser.parseMessageType(
+            "message schema {\n"
+                + " optional boolean BOOLEAN_COL = 1;\n"
+                + " optional int32 INT_COL = 2;\n"
+                + " optional int64 LONG_COL = 3;\n"
+                + " optional float FLOAT_COL = 4;\n"
+                + " optional double DOUBLE_COL = 5;\n"
+                + " optional int64 DECIMAL_COL (DECIMAL(10,5)) = 6;\n"
+                + " optional binary STRING_COL (STRING) = 7;\n"
+                + " optional fixed_len_byte_array(10) FIXED_COL = 8;\n"
+                + " optional binary BINARY_COL = 9;\n"
+                + " optional int32 DATE_COL (DATE) = 10;\n"
+                + " optional int64 TIME_COL (TIME(MICROS,false)) = 11;\n"
+                + " optional int64 TIMESTAMP_NTZ_COL (TIMESTAMP(MICROS,false)) = 12;\n"
+                + " optional int64 TIMESTAMP_LTZ_COL (TIMESTAMP(MICROS,true)) = 13;\n"
+                + "}\n");
+    assertFindSubColumns(schema);
+  }
+
+  /** Schema mixing lists, maps and plain groups nested inside one another. */
+  @Test
+  public void testNestedSchema() {
+    MessageType schema =
+        MessageTypeParser.parseMessageType(
+            "message schema {\n"
+                + " optional group LIST_COL (LIST) = 1 {\n"
+                + " repeated group list {\n"
+                + " optional group element = 4 {\n"
+                + " optional group map_col (MAP) = 5 {\n"
+                + " repeated group key_value {\n"
+                + " required binary key (STRING) = 6;\n"
+                + " optional group value (LIST) = 7 {\n"
+                + " repeated group list {\n"
+                + " optional group element = 8 {\n"
+                + " optional int32 int_col = 9;\n"
+                + " optional boolean boolean_col = 10;\n"
+                + " optional group map_col (MAP) = 11 {\n"
+                + " repeated group key_value {\n"
+                + " required int32 key = 12;\n"
+                + " optional int32 value = 13;\n"
+                + " }\n"
+                + " }\n"
+                + " }\n"
+                + " }\n"
+                + " }\n"
+                + " }\n"
+                + " }\n"
+                + " optional group obj_col = 14 {\n"
+                + " optional group list_col (LIST) = 15 {\n"
+                + " repeated group list {\n"
+                + " optional int32 element = 16;\n"
+                + " }\n"
+                + " }\n"
+                + " optional group map_col (MAP) = 17 {\n"
+                + " repeated group key_value {\n"
+                + " required binary key (STRING) = 18;\n"
+                + " optional binary value (STRING) = 19;\n"
+                + " }\n"
+                + " }\n"
+                + " }\n"
+                + " optional int32 int_col = 20;\n"
+                + " optional float float_col = 21;\n"
+                + " }\n"
+                + " }\n"
+                + " }\n"
+                + " optional group OBJ_COL = 2 {\n"
+                + " optional group obj_col = 22 {\n"
+                + " optional group map_col (MAP) = 23 {\n"
+                + " repeated group key_value {\n"
+                + " required int32 key = 24;\n"
+                + " optional int32 value = 25;\n"
+                + " }\n"
+                + " }\n"
+                + " }\n"
+                + " }\n"
+                + " optional double DOUBLE_COL = 3;\n"
+                + "}");
+    assertFindSubColumns(schema);
+  }
+
+  /**
+   * Asserts that, for every dot path derivable from {@code schema}, the finder returns the same
+   * sub-columns as {@link #findSubColumn(MessageType, String)}; element order is ignored.
+   *
+   * @param schema schema used both to build the finder and to compute the expected answer
+   */
+  private void assertFindSubColumns(MessageType schema) {
+    SubColumnFinder subColumnFinder = new SubColumnFinder(schema);
+    for (String dotPath : getAllPossibleDotPath(schema)) {
+      assertThat(subColumnFinder.getSubColumns(dotPath))
+          .usingRecursiveComparison()
+          .ignoringCollectionOrder()
+          .isEqualTo(findSubColumn(schema, dotPath));
+    }
+  }
+
+  /**
+   * Collects every non-empty prefix of every column path in {@code schema}, each prefix joined
+   * into a single dot path.
+   *
+   * @param schema schema whose column paths are enumerated
+   * @return the distinct dot paths, in no particular order
+   */
+  private Iterable<String> getAllPossibleDotPath(MessageType schema) {
+    Set<String> dotPaths = new HashSet<>();
+    for (ColumnDescriptor column : schema.getColumns()) {
+      String[] path = column.getPath();
+      if (path.length == 0) {
+        continue;
+      }
+      String dotPath = path[0];
+      dotPaths.add(dotPath);
+      for (int i = 1; i < path.length; i++) {
+        dotPath = concatDotPath(dotPath, path[i]);
+        dotPaths.add(dotPath);
+      }
+    }
+    return dotPaths;
+  }
+
+  /**
+   * Reference implementation: scans all columns of {@code schema} and keeps those whose dot path
+   * is {@code dotPath} itself or continues it with a {@code '.'} separator.
+   *
+   * @param schema schema whose columns are scanned
+   * @param dotPath dot path of the column or group to look under
+   * @return dot paths of the matching columns, in schema order
+   */
+  private List<String> findSubColumn(MessageType schema, String dotPath) {
+    return schema.getColumns().stream()
+        .map(ColumnDescriptor::getPath)
+        .map(Utils::concatDotPath)
+        .filter(
+            s ->
+                s.startsWith(dotPath)
+                    // Require a path-segment boundary so "a.b" does not match "a.bc".
+                    && (s.length() == dotPath.length() || s.charAt(dotPath.length()) == '.'))
+        .collect(Collectors.toList());
+  }
+}