From eeb56c00f1e4e83e35bdcb51a87e6feb931b6928 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Wed, 7 Dec 2022 20:26:05 +0000 Subject: [PATCH 01/29] fix issue --- .../net/snowflake/ingest/connection/RequestBuilder.java | 8 +++++--- .../snowflake/ingest/streaming/internal/FlushService.java | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/connection/RequestBuilder.java b/src/main/java/net/snowflake/ingest/connection/RequestBuilder.java index 2b438a8dd..b428d23eb 100644 --- a/src/main/java/net/snowflake/ingest/connection/RequestBuilder.java +++ b/src/main/java/net/snowflake/ingest/connection/RequestBuilder.java @@ -313,7 +313,7 @@ private static Properties loadProperties() { return properties; } - LOGGER.debug("Loaded project version " + properties.getProperty("version")); + LOGGER.info("Loaded project version " + properties.getProperty("version")); return properties; } @@ -345,9 +345,11 @@ private static String getDefaultUserAgent() { // Add Java Version final String javaVersion = System.getProperty("java.version"); - defaultUserAgent.append(JAVA_USER_AGENT + "/" + javaVersion); + defaultUserAgent.append(JAVA_USER_AGENT + "/").append(javaVersion); + String userAgent = defaultUserAgent.toString(); - return defaultUserAgent.toString(); + LOGGER.info("Default user agent " + userAgent); + return userAgent; } private static String buildCustomUserAgent(String additionalUserAgentInfo) { diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java index a6a80f275..7d75a34d3 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java @@ -141,7 +141,7 @@ List>> getData() { this.isNeedFlush = false; this.lastFlushTime = System.currentTimeMillis(); this.isTestMode = isTestMode; - this.latencyTimerContextMap = new HashMap<>(); + this.latencyTimerContextMap = new ConcurrentHashMap<>(); this.bdecVersion = this.owningClient.getParameterProvider().getBlobFormatVersion(); createWorkers(); } From 911c63fc2cfc6bea1ebcc595e297537da76ba6c8 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Thu, 16 Feb 2023 22:57:55 -0800 Subject: [PATCH 02/29] encryption --- .../streaming/internal/AbstractRowBuffer.java | 15 +-- .../streaming/internal/FlushService.java | 91 +++++++++---------- 2 files changed, 47 insertions(+), 59 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java b/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java index 973b6e81b..5c7140fcb 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java @@ -5,25 +5,14 @@ package net.snowflake.ingest.streaming.internal; import java.time.ZoneId; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; +import java.util.*; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import java.util.function.Consumer; import java.util.stream.Collectors; import net.snowflake.ingest.streaming.InsertValidationResponse; import net.snowflake.ingest.streaming.OpenChannelRequest; -import net.snowflake.ingest.utils.Constants; -import net.snowflake.ingest.utils.ErrorCode; -import 
net.snowflake.ingest.utils.Logging; -import net.snowflake.ingest.utils.Pair; -import net.snowflake.ingest.utils.SFException; -import net.snowflake.ingest.utils.Utils; +import net.snowflake.ingest.utils.*; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.util.VisibleForTesting; diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java index b4b318a8c..022e85c38 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java @@ -4,11 +4,7 @@ package net.snowflake.ingest.streaming.internal; -import static net.snowflake.ingest.utils.Constants.BLOB_EXTENSION_TYPE; -import static net.snowflake.ingest.utils.Constants.DISABLE_BACKGROUND_FLUSH; -import static net.snowflake.ingest.utils.Constants.MAX_BLOB_SIZE_IN_BYTES; -import static net.snowflake.ingest.utils.Constants.MAX_THREAD_COUNT; -import static net.snowflake.ingest.utils.Constants.THREAD_SHUTDOWN_TIMEOUT_IN_SEC; +import static net.snowflake.ingest.utils.Constants.*; import static net.snowflake.ingest.utils.Utils.getStackTrace; import com.codahale.metrics.Timer; @@ -17,35 +13,15 @@ import java.security.InvalidAlgorithmParameterException; import java.security.InvalidKeyException; import java.security.NoSuchAlgorithmException; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Collections; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.TimeZone; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ThreadFactory; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; +import java.util.*; +import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.atomic.AtomicReference; import javax.crypto.BadPaddingException; import javax.crypto.IllegalBlockSizeException; import javax.crypto.NoSuchPaddingException; import net.snowflake.client.jdbc.SnowflakeSQLException; import net.snowflake.client.jdbc.internal.google.common.util.concurrent.ThreadFactoryBuilder; -import net.snowflake.ingest.utils.Constants; -import net.snowflake.ingest.utils.ErrorCode; -import net.snowflake.ingest.utils.Logging; -import net.snowflake.ingest.utils.Pair; -import net.snowflake.ingest.utils.SFException; -import net.snowflake.ingest.utils.Utils; +import net.snowflake.ingest.utils.*; import org.apache.arrow.util.VisibleForTesting; import org.apache.arrow.vector.VectorSchemaRoot; @@ -341,30 +317,53 @@ void distributeFlushTasks() { while (itr.hasNext()) { List>> blobData = new ArrayList<>(); - AtomicReference totalBufferSizeInBytes = new AtomicReference<>((float) 0); - + List> leftoverChannelsDataPerTable = new ArrayList<>(); + float totalBufferSizeInBytes = 0F; final String filePath = getFilePath(this.targetStage.getClientPrefix()); // Distribute work at table level, create a new blob if reaching the blob size limit - while (itr.hasNext() && totalBufferSizeInBytes.get() <= MAX_BLOB_SIZE_IN_BYTES) { - ConcurrentHashMap> table = - itr.next().getValue(); + while (itr.hasNext() || !leftoverChannelsDataPerTable.isEmpty()) { List> channelsDataPerTable = Collections.synchronizedList(new 
ArrayList<>()); - // Use parallel stream since getData could be the performance bottleneck when we have a high - // number of channels - table.values().parallelStream() - .forEach( - channel -> { - if (channel.isValid()) { - ChannelData data = channel.getData(filePath); - if (data != null) { - channelsDataPerTable.add(data); - totalBufferSizeInBytes.updateAndGet(v -> v + data.getBufferSize()); + if (!leftoverChannelsDataPerTable.isEmpty()) { + channelsDataPerTable.addAll(leftoverChannelsDataPerTable); + leftoverChannelsDataPerTable.clear(); + } else { + ConcurrentHashMap> table = + itr.next().getValue(); + // Use parallel stream since getData could be the performance bottleneck when we have a + // high number of channels + table.values().parallelStream() + .forEach( + channel -> { + if (channel.isValid()) { + ChannelData data = channel.getData(filePath); + if (data != null) { + channelsDataPerTable.add(data); + } } - } - }); + }); + } + if (!channelsDataPerTable.isEmpty()) { - blobData.add(channelsDataPerTable); + int idx = 0; + while (idx < channelsDataPerTable.size()) { + ChannelData channelData = channelsDataPerTable.get(idx); + totalBufferSizeInBytes += channelData.getBufferSize(); + if (totalBufferSizeInBytes > MAX_BLOB_SIZE_IN_BYTES + || (idx > 0 + && !Objects.equals( + channelData.getChannelContext().getEncryptionKeyId(), + channelsDataPerTable + .get(idx - 1) + .getChannelContext() + .getEncryptionKeyId()))) { + leftoverChannelsDataPerTable.addAll( + channelsDataPerTable.subList(idx + 1, channelsDataPerTable.size())); + break; + } + idx++; + } + blobData.add(channelsDataPerTable.subList(0, idx)); } } From b0a33d9d1d1b0ed903aae6da988e7c13783e6054 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Thu, 16 Feb 2023 23:01:06 -0800 Subject: [PATCH 03/29] fix format --- .../streaming/internal/AbstractRowBuffer.java | 15 +++++++-- .../streaming/internal/FlushService.java | 32 ++++++++++++++++--- 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java b/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java index 5c7140fcb..973b6e81b 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java @@ -5,14 +5,25 @@ package net.snowflake.ingest.streaming.internal; import java.time.ZoneId; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import java.util.function.Consumer; import java.util.stream.Collectors; import net.snowflake.ingest.streaming.InsertValidationResponse; import net.snowflake.ingest.streaming.OpenChannelRequest; -import net.snowflake.ingest.utils.*; +import net.snowflake.ingest.utils.Constants; +import net.snowflake.ingest.utils.ErrorCode; +import net.snowflake.ingest.utils.Logging; +import net.snowflake.ingest.utils.Pair; +import net.snowflake.ingest.utils.SFException; +import net.snowflake.ingest.utils.Utils; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.util.VisibleForTesting; diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java index 022e85c38..6262dd95e 100644 --- 
a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java @@ -4,7 +4,11 @@ package net.snowflake.ingest.streaming.internal; -import static net.snowflake.ingest.utils.Constants.*; +import static net.snowflake.ingest.utils.Constants.BLOB_EXTENSION_TYPE; +import static net.snowflake.ingest.utils.Constants.DISABLE_BACKGROUND_FLUSH; +import static net.snowflake.ingest.utils.Constants.MAX_BLOB_SIZE_IN_BYTES; +import static net.snowflake.ingest.utils.Constants.MAX_THREAD_COUNT; +import static net.snowflake.ingest.utils.Constants.THREAD_SHUTDOWN_TIMEOUT_IN_SEC; import static net.snowflake.ingest.utils.Utils.getStackTrace; import com.codahale.metrics.Timer; @@ -13,15 +17,35 @@ import java.security.InvalidAlgorithmParameterException; import java.security.InvalidKeyException; import java.security.NoSuchAlgorithmException; -import java.util.*; -import java.util.concurrent.*; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.TimeZone; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; import javax.crypto.BadPaddingException; import javax.crypto.IllegalBlockSizeException; import javax.crypto.NoSuchPaddingException; import net.snowflake.client.jdbc.SnowflakeSQLException; import net.snowflake.client.jdbc.internal.google.common.util.concurrent.ThreadFactoryBuilder; -import net.snowflake.ingest.utils.*; +import net.snowflake.ingest.utils.Constants; +import net.snowflake.ingest.utils.ErrorCode; +import net.snowflake.ingest.utils.Logging; +import net.snowflake.ingest.utils.Pair; +import net.snowflake.ingest.utils.SFException; +import net.snowflake.ingest.utils.Utils; import org.apache.arrow.util.VisibleForTesting; import org.apache.arrow.vector.VectorSchemaRoot; From ff14184daee68112ffeeda40bf7caa2eb963f605 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Wed, 22 Feb 2023 15:30:30 -0800 Subject: [PATCH 04/29] add tests --- .../streaming/internal/FlushService.java | 32 ++++++++--- .../streaming/internal/FlushServiceTest.java | 55 ++++++++++++++++++- 2 files changed, 77 insertions(+), 10 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java index 6262dd95e..7758e30e6 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java @@ -338,14 +338,15 @@ void distributeFlushTasks() { String, ConcurrentHashMap>>> itr = this.channelCache.iterator(); List, CompletableFuture>> blobs = new ArrayList<>(); + List> leftoverChannelsDataPerTable = new ArrayList<>(); - while (itr.hasNext()) { + while (itr.hasNext() || !leftoverChannelsDataPerTable.isEmpty()) { List>> blobData = new ArrayList<>(); - List> leftoverChannelsDataPerTable = new ArrayList<>(); float totalBufferSizeInBytes = 0F; final String filePath = getFilePath(this.targetStage.getClientPrefix()); - // Distribute work 
at table level, create a new blob if reaching the blob size limit + // Distribute work at table level, split the blob if reaching the blob size limit or the + // channel has different encryption key ids while (itr.hasNext() || !leftoverChannelsDataPerTable.isEmpty()) { List> channelsDataPerTable = Collections.synchronizedList(new ArrayList<>()); if (!leftoverChannelsDataPerTable.isEmpty()) { @@ -372,22 +373,37 @@ void distributeFlushTasks() { int idx = 0; while (idx < channelsDataPerTable.size()) { ChannelData channelData = channelsDataPerTable.get(idx); - totalBufferSizeInBytes += channelData.getBufferSize(); - if (totalBufferSizeInBytes > MAX_BLOB_SIZE_IN_BYTES - || (idx > 0 - && !Objects.equals( + // Stop processing the rest of channels if reaching the blob size limit or the channel + // has different encryption key ids + if (idx > 0 + && (totalBufferSizeInBytes + channelData.getBufferSize() > MAX_BLOB_SIZE_IN_BYTES + || !Objects.equals( channelData.getChannelContext().getEncryptionKeyId(), channelsDataPerTable .get(idx - 1) .getChannelContext() .getEncryptionKeyId()))) { leftoverChannelsDataPerTable.addAll( - channelsDataPerTable.subList(idx + 1, channelsDataPerTable.size())); + channelsDataPerTable.subList(idx, channelsDataPerTable.size())); + logger.logInfo( + "Creation of another blob is needed because of blob size limit or different" + + " encryption ids, client={}, table={}, size={}, encryptionId1={}," + + " encryptionId2={}", + this.owningClient.getName(), + channelData.getChannelContext().getTableName(), + totalBufferSizeInBytes + channelData.getBufferSize(), + channelData.getChannelContext().getEncryptionKeyId(), + channelsDataPerTable.get(idx - 1).getChannelContext().getEncryptionKeyId()); break; } + totalBufferSizeInBytes += channelData.getBufferSize(); idx++; } + // Add processed channels to the current blob, stop if we need to create a new blob blobData.add(channelsDataPerTable.subList(0, idx)); + if (idx != channelsDataPerTable.size()) { + break; + } } } diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java index 90344b8a6..f70a3e0dc 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java @@ -57,8 +57,7 @@ public class FlushServiceTest { @Parameterized.Parameters(name = "{0}") public static Collection testContextFactory() { - return Arrays.asList( - new Object[][] {{ArrowTestContext.createFactory()}, {ParquetTestContext.createFactory()}}); + return Arrays.asList(new Object[][] {{ArrowTestContext.createFactory()}}); } public FlushServiceTest(TestContextFactory testContextFactory) { @@ -349,6 +348,8 @@ private SnowflakeStreamingIngestChannelInternal addChannel1(TestContext te .setOffsetToken("offset1") .setChannelSequencer(0L) .setRowSequencer(0L) + .setEncryptionKey("key") + .setEncryptionKeyId(1L) .buildAndAdd(); } @@ -361,6 +362,8 @@ private SnowflakeStreamingIngestChannelInternal addChannel2(TestContext te .setOffsetToken("offset2") .setChannelSequencer(10L) .setRowSequencer(100L) + .setEncryptionKey("key") + .setEncryptionKeyId(1L) .buildAndAdd(); } @@ -373,6 +376,22 @@ private SnowflakeStreamingIngestChannelInternal addChannel3(TestContext te .setOffsetToken("offset3") .setChannelSequencer(0L) .setRowSequencer(0L) + .setEncryptionKey("key3") + .setEncryptionKeyId(3L) + .buildAndAdd(); + } + + private SnowflakeStreamingIngestChannelInternal 
addChannel4(TestContext testContext) { + return testContext + .channelBuilder("channel4") + .setDBName("db1") + .setSchemaName("schema1") + .setTableName("table1") + .setOffsetToken("offset2") + .setChannelSequencer(10L) + .setRowSequencer(100L) + .setEncryptionKey("key4") + .setEncryptionKeyId(4L) .buildAndAdd(); } @@ -460,6 +479,38 @@ public void testFlush() throws Exception { Assert.assertTrue(flushService.lastFlushTime > 0); } + @Test + public void testBlobCreation() throws Exception { + TestContext testContext = testContextFactory.create(); + SnowflakeStreamingIngestChannelInternal channel1 = addChannel1(testContext); + SnowflakeStreamingIngestChannelInternal channel2 = addChannel2(testContext); + SnowflakeStreamingIngestChannelInternal channel4 = addChannel4(testContext); + + List schema = Arrays.asList(createTestIntegerColumn(), createTestTextColumn()); + channel1.getRowBuffer().setupSchema(schema); + channel2.getRowBuffer().setupSchema(schema); + channel4.getRowBuffer().setupSchema(schema); + + List> rows1 = + RowSetBuilder.newBuilder() + .addColumn("COLINT", 11) + .addColumn("COLCHAR", "bob") + .newRow() + .addColumn("COLINT", 22) + .addColumn("COLCHAR", "bob") + .build(); + + channel1.insertRows(rows1, "offset1"); + channel2.insertRows(rows1, "offset2"); + channel4.insertRows(rows1, "offset4"); + + FlushService flushService = testContext.flushService; + + // Force = true flushes + flushService.flush(true).get(); + Mockito.verify(flushService, Mockito.atLeast(2)).buildAndUpload(Mockito.any(), Mockito.any()); + } + @Test public void testBuildAndUpload() throws Exception { TestContext testContext = testContextFactory.create(); From 55fdb5902864403f5bb5607b200341b9eb925554 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Wed, 22 Feb 2023 15:33:54 -0800 Subject: [PATCH 05/29] update --- .../snowflake/ingest/streaming/internal/FlushServiceTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java index f70a3e0dc..0c485b94b 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java @@ -57,7 +57,8 @@ public class FlushServiceTest { @Parameterized.Parameters(name = "{0}") public static Collection testContextFactory() { - return Arrays.asList(new Object[][] {{ArrowTestContext.createFactory()}}); + return Arrays.asList( + new Object[][] {{ArrowTestContext.createFactory()}, {ParquetTestContext.createFactory()}}); } public FlushServiceTest(TestContextFactory testContextFactory) { From 94d2d054a38680684dbe0420ffb7e32e7c699145 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Wed, 5 Apr 2023 22:03:55 -0700 Subject: [PATCH 06/29] perf fixes --- pom.xml | 18 ++ .../SnowflakeStreamingIngestExample.java | 289 ++++++++++++++---- .../streaming/internal/ArrowRowBuffer.java | 16 +- .../internal/DataValidationUtil.java | 15 +- .../streaming/internal/LiteralQuoteUtils.java | 27 +- .../streaming/internal/ParquetRowBuffer.java | 15 +- 6 files changed, 273 insertions(+), 107 deletions(-) diff --git a/pom.xml b/pom.xml index 50acbf777..20a3d6ee0 100644 --- a/pom.xml +++ b/pom.xml @@ -280,6 +280,24 @@ jackson-databind + + + + com.github.ben-manes.caffeine + caffeine + 2.9.3 + + + com.google.errorprone + error_prone_annotations + + + org.checkerframework + checker-qual + + + + com.google.code.findbugs jsr305 diff --git 
a/src/main/java/net/snowflake/ingest/streaming/example/SnowflakeStreamingIngestExample.java b/src/main/java/net/snowflake/ingest/streaming/example/SnowflakeStreamingIngestExample.java index 2b12aba9f..8beed80c7 100644 --- a/src/main/java/net/snowflake/ingest/streaming/example/SnowflakeStreamingIngestExample.java +++ b/src/main/java/net/snowflake/ingest/streaming/example/SnowflakeStreamingIngestExample.java @@ -6,17 +6,20 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Properties; +import java.util.concurrent.TimeUnit; import net.snowflake.ingest.streaming.InsertValidationResponse; import net.snowflake.ingest.streaming.OpenChannelRequest; import net.snowflake.ingest.streaming.SnowflakeStreamingIngestChannel; import net.snowflake.ingest.streaming.SnowflakeStreamingIngestClient; import net.snowflake.ingest.streaming.SnowflakeStreamingIngestClientFactory; +import org.apache.hadoop.util.StopWatch; /** * Example on how to use the Streaming Ingest client APIs. @@ -24,13 +27,65 @@ *
<p>
Please read the README.md file for detailed steps */ public class SnowflakeStreamingIngestExample { - // Please follow the example in profile_streaming.json.example to see the required properties, or - // if you have already set up profile.json with Snowpipe before, all you need is to add the "role" - // property. + private static String PROFILE_PATH = "profile.json"; private static final ObjectMapper mapper = new ObjectMapper(); + private enum ETabType { + VARCHAR, + INT, + NUM38, + DATE + } + + // Below are the seeting which we can control. + + /** Indicates how many columns are required for the table */ + private static final int nDataCols = 30; + // Col len must be at least 30 + /** Indicates column length */ + private static final int dataColLen = 100; + /** Indicates how many rows are needed */ + private static final int numRows = 2000000; + + /** Indicates the data type for each column */ + private static final ETabType tabType = ETabType.VARCHAR; + /** setting to true will drop the existing table */ + private boolean DROP_TABLES = true; + /** setting to true will create a new table */ + private boolean CREATE_TABLES = true; + /** setting to true will truncate a existing table */ + private boolean TRUNCATE_TABLES = false; + + /** setting to true will insert data into the table via snowpipe streaming */ + private boolean INSERT_TABLES = true; + + /** setting to true will use the quotes for the column during table creation and data insert */ + private static boolean ENABLE_QUOTES = false; + /** + * setting to true will use the ArrowBuffer. This flag is only needed when using SDK version + * >1.1.0 + */ + private static boolean USE_ARROW = false; + + // Connection properties + private static String USER_NAME = "NOUFALBA"; + private static String URL = "https://informatica.eu-central-1.snowflakecomputing.com:443"; + private static String PRIVATE_KEY_FILE_LOCATION = "C:\\snowflake\\key\\rsa_streaming_key.p8"; + private static String PORT = "443"; + private static String SCHEME = "https"; + private static String ROLE = "SYSADMIN"; + private static String DATA_BASE = "testdb_kafka"; + private static String SCHEMA = "kafka_test"; + private static String WARE_HOUSE = "DBMI_WH1"; + private String pad; + private String columnNamesArray[]; + public static void main(String[] args) throws Exception { + new SnowflakeStreamingIngestExample().doIt(); + } + + private static Properties getKeysPairAuthParams(boolean isStreamConnection) throws IOException { Properties props = new Properties(); Iterator> propIt = mapper.readTree(new String(Files.readAllBytes(Paths.get(PROFILE_PATH)))).fields(); @@ -38,62 +93,190 @@ public static void main(String[] args) throws Exception { Map.Entry prop = propIt.next(); props.put(prop.getKey(), prop.getValue().asText()); } + return props; + } + + public void doIt() throws Exception { + + if (dataColLen < 30) { + throw new IllegalArgumentException("Col len must be >=30"); + } + + if (dataColLen % 10 != 0) { + throw new IllegalArgumentException("Col len must be a multiple of 10"); + } + + final StringBuilder padBuilder = new StringBuilder(); + for (int i = 0; i < dataColLen; ++i) { + padBuilder.append("X"); + } + pad = padBuilder.toString(); + + // get all column names and cache it + columnNamesArray = new String[nDataCols]; + for (int i = 0; i < nDataCols; ++i) { + columnNamesArray[i] = getColName(i + 1); + } - // Create a streaming ingest client - try (SnowflakeStreamingIngestClient client = - 
SnowflakeStreamingIngestClientFactory.builder("MY_CLIENT").setProperties(props).build()) { - - // Create an open channel request on table MY_TABLE, note that the corresponding - // db/schema/table needs to be present - // Example: create or replace table MY_TABLE(c1 number); - OpenChannelRequest request1 = - OpenChannelRequest.builder("MY_CHANNEL") - .setDBName("MY_DATABASE") - .setSchemaName("MY_SCHEMA") - .setTableName("MY_TABLE") - .setOnErrorOption( - OpenChannelRequest.OnErrorOption.CONTINUE) // Another ON_ERROR option is ABORT - .build(); - - // Open a streaming ingest channel from the given client - SnowflakeStreamingIngestChannel channel1 = client.openChannel(request1); - - // Insert rows into the channel (Using insertRows API) - final int totalRowsInTable = 1000; - for (int val = 0; val < totalRowsInTable; val++) { - Map row = new HashMap<>(); - - // c1 corresponds to the column name in table - row.put("c1", val); - - // Insert the row with the current offset_token - InsertValidationResponse response = channel1.insertRow(row, String.valueOf(val)); - if (response.hasErrors()) { - // Simply throw if there is an exception, or you can do whatever you want with the - // erroneous row - throw response.getInsertErrors().get(0).getException(); + if (INSERT_TABLES) { + new Inserter().doInserts(); + } + + System.out.println("Done"); + } + + private String getColDef() { + String colDef; + switch (tabType) { + case VARCHAR: + colDef = String.format("varchar(%s)", dataColLen); + break; + case NUM38: + colDef = String.format("NUMBER(%s)", 38); + break; + case INT: + colDef = "INTEGER"; + break; + case DATE: + colDef = "DATE"; + break; + default: + throw new RuntimeException("Unsupported : " + tabType); + } + + return colDef; + } + + private String getFullyQualifiedTableName() { + return String.format("%s.%s", SCHEMA, getTabName()); + } + + private String getTabName() { + int tabNum = 1; + String tabName; + + switch (tabType) { + case VARCHAR: + tabName = String.format("tabL%06d", tabNum); + break; + default: + throw new RuntimeException("Unsupported : " + tabType); + } + return tabName; + } + + private String getColName(int colNum) { + if (ENABLE_QUOTES) { + return wrap(String.format("Col_%04d", colNum)); + } else { + return String.format("Col_%04d", colNum); + } + } + + public static String wrap(String identifier) { + final String quote = "\""; + return new StringBuilder(quote).append(identifier).append(quote).toString(); + } + + /////////////////////////////// + private class Inserter { + + public Inserter() {} + + public void doInserts() throws Exception { + try (SnowflakeStreamingIngestClient client = + SnowflakeStreamingIngestClientFactory.builder("INFA_CLIENT") + .setProperties(getKeysPairAuthParams(true)) + .build()) { + // Open a streaming ingest channel from the given client + OpenChannelRequest request1 = + OpenChannelRequest.builder("MSSQL_TEST_RS_84") + .setDBName(DATA_BASE) + .setSchemaName(SCHEMA) + .setTableName("t_streamingingest") + .setOnErrorOption( + OpenChannelRequest.OnErrorOption.CONTINUE) // Another ON_ERROR option is ABORT + .build(); + + // Open a streaming ingest channel from the given client + SnowflakeStreamingIngestChannel channel1 = client.openChannel(request1); + + String previousOffsetTokenFromSnowflake = channel1.getLatestCommittedOffsetToken(); + + System.out.println( + "=============================================================================="); + System.out.println( + "******************************** STARTING OFFSET IS " + + 
previousOffsetTokenFromSnowflake); + System.out.println( + "======================================f========================================"); + + // Insert rows into the channel (Using insertRows API) + StopWatch watch = new StopWatch(); + watch.start(); + + for (int val = 0; val < numRows; val++) { + Map row = new HashMap<>(); + for (int bc = 0; bc < nDataCols; ++bc) { + + row.put(columnNamesArray[bc], buildDataCol()); + } + InsertValidationResponse response = channel1.insertRow(row, String.valueOf(val + 1)); + if (response.hasErrors()) { + // Simply throw if there is an exception, or you can do whatever you want with the + // erroneous row + throw response.getInsertErrors().get(0).getException(); + } } + + System.out.println("aaaaaaaaa Elapsed Time in Seconds: " + watch.now(TimeUnit.SECONDS)); + + // If needed, you can check the offset_token registered in Snowflake to make sure everything + // is committed + final String expectedOffsetTokenInSnowflake = String.valueOf(numRows); + final int maxRetries = 60; + int retryCount = 0; + + do { + String offsetTokenFromSnowflake = channel1.getLatestCommittedOffsetToken(); + System.out.println( + "=============================================================================="); + System.out.println( + "+++++++++++++++++++++++++++++++++++++++++ CURRENT OFFSET IS " + + offsetTokenFromSnowflake); + System.out.println( + "=============================================================================="); + if (offsetTokenFromSnowflake != null + && offsetTokenFromSnowflake.equals(String.valueOf(expectedOffsetTokenInSnowflake))) { + System.out.println( + "=============================================================================="); + System.out.println( + "+++++++++++++++++++++++++++++++++++++++++ SUCCESSFULLY inserted " + + numRows + + " rows"); + System.out.println( + "=============================================================================="); + break; + } + + retryCount++; + } while (true); + watch.stop(); + System.out.println("aaaaaaaaa Elapsed Time in Seconds: " + watch.now(TimeUnit.SECONDS)); } + } - // If needed, you can check the offset_token registered in Snowflake to make sure everything - // is committed - final int expectedOffsetTokenInSnowflake = totalRowsInTable - 1; // 0 based offset_token - final int maxRetries = 10; - int retryCount = 0; - - do { - String offsetTokenFromSnowflake = channel1.getLatestCommittedOffsetToken(); - if (offsetTokenFromSnowflake != null - && offsetTokenFromSnowflake.equals(String.valueOf(expectedOffsetTokenInSnowflake))) { - System.out.println("SUCCESSFULLY inserted " + totalRowsInTable + " rows"); + private Object buildDataCol() { + Object dataVal; + switch (tabType) { + case VARCHAR: + dataVal = pad; break; - } - retryCount++; - } while (retryCount < maxRetries); + default: + throw new RuntimeException("Unsupported : " + tabType); + } - // Close the channel, the function internally will make sure everything is committed (or throw - // an exception if there is any issue) - channel1.close().get(); + return dataVal; } } } diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/ArrowRowBuffer.java b/src/main/java/net/snowflake/ingest/streaming/internal/ArrowRowBuffer.java index c253014c0..31c0ef53e 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/ArrowRowBuffer.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/ArrowRowBuffer.java @@ -428,14 +428,9 @@ private float convertRowToArrow( // Create new empty stats just for the current row. 
Map forkedStatsMap = new HashMap<>(); - // We need to iterate twice over the row and over unquoted names, we store the value to avoid - // re-computation - Map userInputToUnquotedColumnNameMap = new HashMap<>(); - for (Map.Entry entry : row.entrySet()) { rowBufferSize += 0.125; // 1/8 for null value bitmap String columnName = LiteralQuoteUtils.unquoteColumnName(entry.getKey()); - userInputToUnquotedColumnNameMap.put(entry.getKey(), columnName); Object value = entry.getValue(); Field field = this.fields.get(columnName); Utils.assertNotNull("Arrow column field", field); @@ -737,12 +732,13 @@ private float convertRowToArrow( // All input values passed validation, iterate over the columns again and combine their existing // statistics with the forked statistics for the current row. - for (String userInputColumnName : row.keySet()) { - String columnName = userInputToUnquotedColumnNameMap.get(userInputColumnName); - RowBufferStats stats = statsMap.get(columnName); - RowBufferStats forkedStats = forkedStatsMap.get(columnName); - statsMap.put(columnName, RowBufferStats.getCombinedStats(stats, forkedStats)); + for (Map.Entry forkedColStats : forkedStatsMap.entrySet()) { + String columnName = forkedColStats.getKey(); + statsMap.put( + columnName, + RowBufferStats.getCombinedStats(statsMap.get(columnName), forkedColStats.getValue())); } + // Insert nulls to the columns that doesn't show up in the input for (String columnName : Sets.difference(this.fields.keySet(), inputColumnNames)) { rowBufferSize += 0.125; // 1/8 for null value bitmap diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java b/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java index d9fd77273..3fac37429 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java @@ -13,10 +13,6 @@ import com.fasterxml.jackson.databind.ser.std.ToStringSerializer; import java.math.BigDecimal; import java.math.BigInteger; -import java.nio.CharBuffer; -import java.nio.charset.CharacterCodingException; -import java.nio.charset.CharsetEncoder; -import java.nio.charset.CodingErrorAction; import java.nio.charset.StandardCharsets; import java.time.Instant; import java.time.LocalDate; @@ -846,14 +842,9 @@ private static String sanitizeValueForExceptionMessage(Object value) { * UTF-16 surrogate, for example. 
*/ private static void verifyValidUtf8(String input, String columnName, String dataType) { - CharsetEncoder charsetEncoder = - StandardCharsets.UTF_8 - .newEncoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); - try { - charsetEncoder.encode(CharBuffer.wrap(input)); - } catch (CharacterCodingException e) { + String roundTripStr = + new String(input.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8); + if (!input.equals(roundTripStr)) { throw valueFormatNotAllowedException(columnName, input, dataType, "Invalid Unicode string"); } } diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java b/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java index 74d97cfa0..12fe5cd3a 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java @@ -3,12 +3,8 @@ */ package net.snowflake.ingest.streaming.internal; -import com.google.common.cache.CacheBuilder; -import com.google.common.cache.CacheLoader; -import com.google.common.cache.LoadingCache; -import java.util.concurrent.ExecutionException; -import net.snowflake.ingest.utils.ErrorCode; -import net.snowflake.ingest.utils.SFException; +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.LoadingCache; /** * Util class to normalise literals to match server side metadata. @@ -26,15 +22,9 @@ class LiteralQuoteUtils { static { unquotedColumnNamesCache = - CacheBuilder.newBuilder() + Caffeine.newBuilder() .maximumSize(UNQUOTED_COLUMN_NAME_CACHE_MAX_SIZE) - .build( - new CacheLoader() { - @Override - public String load(String key) { - return unquoteColumnNameInternal(key); - } - }); + .build(LiteralQuoteUtils::unquoteColumnNameInternal); } /** @@ -42,14 +32,7 @@ public String load(String key) { * expensive. If not, it unquotes directly, otherwise it return a value from a loading cache. */ static String unquoteColumnName(String columnName) { - try { - return unquotedColumnNamesCache.get(columnName); - } catch (ExecutionException e) { - throw new SFException( - e, - ErrorCode.INTERNAL_ERROR, - String.format("Exception thrown while unquoting column name %s", columnName)); - } + return unquotedColumnNamesCache.get(columnName); } /** diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java index bc53a61a7..9a37831e8 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java @@ -183,15 +183,10 @@ private float addRow( // Create new empty stats just for the current row. 
Map forkedStatsMap = new HashMap<>(); - // We need to iterate twice over the row and over unquoted names, we store the value to avoid - // re-computation - Map userInputToUnquotedColumnNameMap = new HashMap<>(); - for (Map.Entry entry : row.entrySet()) { String key = entry.getKey(); Object value = entry.getValue(); String columnName = LiteralQuoteUtils.unquoteColumnName(key); - userInputToUnquotedColumnNameMap.put(key, columnName); int colIndex = fieldIndex.get(columnName).getSecond(); RowBufferStats forkedStats = statsMap.get(columnName).forkEmpty(); forkedStatsMap.put(columnName, forkedStats); @@ -209,11 +204,11 @@ private float addRow( // All input values passed validation, iterate over the columns again and combine their existing // statistics with the forked statistics for the current row. - for (String userInputColumnName : row.keySet()) { - String columnName = userInputToUnquotedColumnNameMap.get(userInputColumnName); - RowBufferStats stats = statsMap.get(columnName); - RowBufferStats forkedStats = forkedStatsMap.get(columnName); - statsMap.put(columnName, RowBufferStats.getCombinedStats(stats, forkedStats)); + for (Map.Entry forkedColStats : forkedStatsMap.entrySet()) { + String columnName = forkedColStats.getKey(); + statsMap.put( + columnName, + RowBufferStats.getCombinedStats(statsMap.get(columnName), forkedColStats.getValue())); } // Increment null count for column missing in the input map From e8da46b6948a94ae67b6bb6be6533642432a96b2 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Wed, 5 Apr 2023 22:07:25 -0700 Subject: [PATCH 07/29] update example --- .../SnowflakeStreamingIngestExample.java | 289 ++++-------------- 1 file changed, 53 insertions(+), 236 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/example/SnowflakeStreamingIngestExample.java b/src/main/java/net/snowflake/ingest/streaming/example/SnowflakeStreamingIngestExample.java index 8beed80c7..2b12aba9f 100644 --- a/src/main/java/net/snowflake/ingest/streaming/example/SnowflakeStreamingIngestExample.java +++ b/src/main/java/net/snowflake/ingest/streaming/example/SnowflakeStreamingIngestExample.java @@ -6,20 +6,17 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; -import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Properties; -import java.util.concurrent.TimeUnit; import net.snowflake.ingest.streaming.InsertValidationResponse; import net.snowflake.ingest.streaming.OpenChannelRequest; import net.snowflake.ingest.streaming.SnowflakeStreamingIngestChannel; import net.snowflake.ingest.streaming.SnowflakeStreamingIngestClient; import net.snowflake.ingest.streaming.SnowflakeStreamingIngestClientFactory; -import org.apache.hadoop.util.StopWatch; /** * Example on how to use the Streaming Ingest client APIs. @@ -27,65 +24,13 @@ *
<p>
Please read the README.md file for detailed steps */ public class SnowflakeStreamingIngestExample { - + // Please follow the example in profile_streaming.json.example to see the required properties, or + // if you have already set up profile.json with Snowpipe before, all you need is to add the "role" + // property. private static String PROFILE_PATH = "profile.json"; private static final ObjectMapper mapper = new ObjectMapper(); - private enum ETabType { - VARCHAR, - INT, - NUM38, - DATE - } - - // Below are the seeting which we can control. - - /** Indicates how many columns are required for the table */ - private static final int nDataCols = 30; - // Col len must be at least 30 - /** Indicates column length */ - private static final int dataColLen = 100; - /** Indicates how many rows are needed */ - private static final int numRows = 2000000; - - /** Indicates the data type for each column */ - private static final ETabType tabType = ETabType.VARCHAR; - /** setting to true will drop the existing table */ - private boolean DROP_TABLES = true; - /** setting to true will create a new table */ - private boolean CREATE_TABLES = true; - /** setting to true will truncate a existing table */ - private boolean TRUNCATE_TABLES = false; - - /** setting to true will insert data into the table via snowpipe streaming */ - private boolean INSERT_TABLES = true; - - /** setting to true will use the quotes for the column during table creation and data insert */ - private static boolean ENABLE_QUOTES = false; - /** - * setting to true will use the ArrowBuffer. This flag is only needed when using SDK version - * >1.1.0 - */ - private static boolean USE_ARROW = false; - - // Connection properties - private static String USER_NAME = "NOUFALBA"; - private static String URL = "https://informatica.eu-central-1.snowflakecomputing.com:443"; - private static String PRIVATE_KEY_FILE_LOCATION = "C:\\snowflake\\key\\rsa_streaming_key.p8"; - private static String PORT = "443"; - private static String SCHEME = "https"; - private static String ROLE = "SYSADMIN"; - private static String DATA_BASE = "testdb_kafka"; - private static String SCHEMA = "kafka_test"; - private static String WARE_HOUSE = "DBMI_WH1"; - private String pad; - private String columnNamesArray[]; - public static void main(String[] args) throws Exception { - new SnowflakeStreamingIngestExample().doIt(); - } - - private static Properties getKeysPairAuthParams(boolean isStreamConnection) throws IOException { Properties props = new Properties(); Iterator> propIt = mapper.readTree(new String(Files.readAllBytes(Paths.get(PROFILE_PATH)))).fields(); @@ -93,190 +38,62 @@ private static Properties getKeysPairAuthParams(boolean isStreamConnection) thro Map.Entry prop = propIt.next(); props.put(prop.getKey(), prop.getValue().asText()); } - return props; - } - - public void doIt() throws Exception { - - if (dataColLen < 30) { - throw new IllegalArgumentException("Col len must be >=30"); - } - - if (dataColLen % 10 != 0) { - throw new IllegalArgumentException("Col len must be a multiple of 10"); - } - - final StringBuilder padBuilder = new StringBuilder(); - for (int i = 0; i < dataColLen; ++i) { - padBuilder.append("X"); - } - pad = padBuilder.toString(); - - // get all column names and cache it - columnNamesArray = new String[nDataCols]; - for (int i = 0; i < nDataCols; ++i) { - columnNamesArray[i] = getColName(i + 1); - } - if (INSERT_TABLES) { - new Inserter().doInserts(); - } - - System.out.println("Done"); - } - - private String getColDef() { - String colDef; 
- switch (tabType) { - case VARCHAR: - colDef = String.format("varchar(%s)", dataColLen); - break; - case NUM38: - colDef = String.format("NUMBER(%s)", 38); - break; - case INT: - colDef = "INTEGER"; - break; - case DATE: - colDef = "DATE"; - break; - default: - throw new RuntimeException("Unsupported : " + tabType); - } - - return colDef; - } - - private String getFullyQualifiedTableName() { - return String.format("%s.%s", SCHEMA, getTabName()); - } - - private String getTabName() { - int tabNum = 1; - String tabName; - - switch (tabType) { - case VARCHAR: - tabName = String.format("tabL%06d", tabNum); - break; - default: - throw new RuntimeException("Unsupported : " + tabType); - } - return tabName; - } - - private String getColName(int colNum) { - if (ENABLE_QUOTES) { - return wrap(String.format("Col_%04d", colNum)); - } else { - return String.format("Col_%04d", colNum); - } - } - - public static String wrap(String identifier) { - final String quote = "\""; - return new StringBuilder(quote).append(identifier).append(quote).toString(); - } - - /////////////////////////////// - private class Inserter { - - public Inserter() {} - - public void doInserts() throws Exception { - try (SnowflakeStreamingIngestClient client = - SnowflakeStreamingIngestClientFactory.builder("INFA_CLIENT") - .setProperties(getKeysPairAuthParams(true)) - .build()) { - // Open a streaming ingest channel from the given client - OpenChannelRequest request1 = - OpenChannelRequest.builder("MSSQL_TEST_RS_84") - .setDBName(DATA_BASE) - .setSchemaName(SCHEMA) - .setTableName("t_streamingingest") - .setOnErrorOption( - OpenChannelRequest.OnErrorOption.CONTINUE) // Another ON_ERROR option is ABORT - .build(); - - // Open a streaming ingest channel from the given client - SnowflakeStreamingIngestChannel channel1 = client.openChannel(request1); - - String previousOffsetTokenFromSnowflake = channel1.getLatestCommittedOffsetToken(); - - System.out.println( - "=============================================================================="); - System.out.println( - "******************************** STARTING OFFSET IS " - + previousOffsetTokenFromSnowflake); - System.out.println( - "======================================f========================================"); - - // Insert rows into the channel (Using insertRows API) - StopWatch watch = new StopWatch(); - watch.start(); - - for (int val = 0; val < numRows; val++) { - Map row = new HashMap<>(); - for (int bc = 0; bc < nDataCols; ++bc) { - - row.put(columnNamesArray[bc], buildDataCol()); - } - InsertValidationResponse response = channel1.insertRow(row, String.valueOf(val + 1)); - if (response.hasErrors()) { - // Simply throw if there is an exception, or you can do whatever you want with the - // erroneous row - throw response.getInsertErrors().get(0).getException(); - } + // Create a streaming ingest client + try (SnowflakeStreamingIngestClient client = + SnowflakeStreamingIngestClientFactory.builder("MY_CLIENT").setProperties(props).build()) { + + // Create an open channel request on table MY_TABLE, note that the corresponding + // db/schema/table needs to be present + // Example: create or replace table MY_TABLE(c1 number); + OpenChannelRequest request1 = + OpenChannelRequest.builder("MY_CHANNEL") + .setDBName("MY_DATABASE") + .setSchemaName("MY_SCHEMA") + .setTableName("MY_TABLE") + .setOnErrorOption( + OpenChannelRequest.OnErrorOption.CONTINUE) // Another ON_ERROR option is ABORT + .build(); + + // Open a streaming ingest channel from the given client + 
SnowflakeStreamingIngestChannel channel1 = client.openChannel(request1); + + // Insert rows into the channel (Using insertRows API) + final int totalRowsInTable = 1000; + for (int val = 0; val < totalRowsInTable; val++) { + Map row = new HashMap<>(); + + // c1 corresponds to the column name in table + row.put("c1", val); + + // Insert the row with the current offset_token + InsertValidationResponse response = channel1.insertRow(row, String.valueOf(val)); + if (response.hasErrors()) { + // Simply throw if there is an exception, or you can do whatever you want with the + // erroneous row + throw response.getInsertErrors().get(0).getException(); } - - System.out.println("aaaaaaaaa Elapsed Time in Seconds: " + watch.now(TimeUnit.SECONDS)); - - // If needed, you can check the offset_token registered in Snowflake to make sure everything - // is committed - final String expectedOffsetTokenInSnowflake = String.valueOf(numRows); - final int maxRetries = 60; - int retryCount = 0; - - do { - String offsetTokenFromSnowflake = channel1.getLatestCommittedOffsetToken(); - System.out.println( - "=============================================================================="); - System.out.println( - "+++++++++++++++++++++++++++++++++++++++++ CURRENT OFFSET IS " - + offsetTokenFromSnowflake); - System.out.println( - "=============================================================================="); - if (offsetTokenFromSnowflake != null - && offsetTokenFromSnowflake.equals(String.valueOf(expectedOffsetTokenInSnowflake))) { - System.out.println( - "=============================================================================="); - System.out.println( - "+++++++++++++++++++++++++++++++++++++++++ SUCCESSFULLY inserted " - + numRows - + " rows"); - System.out.println( - "=============================================================================="); - break; - } - - retryCount++; - } while (true); - watch.stop(); - System.out.println("aaaaaaaaa Elapsed Time in Seconds: " + watch.now(TimeUnit.SECONDS)); } - } - private Object buildDataCol() { - Object dataVal; - switch (tabType) { - case VARCHAR: - dataVal = pad; + // If needed, you can check the offset_token registered in Snowflake to make sure everything + // is committed + final int expectedOffsetTokenInSnowflake = totalRowsInTable - 1; // 0 based offset_token + final int maxRetries = 10; + int retryCount = 0; + + do { + String offsetTokenFromSnowflake = channel1.getLatestCommittedOffsetToken(); + if (offsetTokenFromSnowflake != null + && offsetTokenFromSnowflake.equals(String.valueOf(expectedOffsetTokenInSnowflake))) { + System.out.println("SUCCESSFULLY inserted " + totalRowsInTable + " rows"); break; - default: - throw new RuntimeException("Unsupported : " + tabType); - } + } + retryCount++; + } while (retryCount < maxRetries); - return dataVal; + // Close the channel, the function internally will make sure everything is committed (or throw + // an exception if there is any issue) + channel1.close().get(); } } } From fdedb236d185c428cb209352e7017e644f1e3631 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Fri, 7 Apr 2023 13:13:16 -0700 Subject: [PATCH 08/29] remove utf8 --- pom.xml | 18 ------------- .../internal/DataValidationUtil.java | 14 ---------- .../streaming/internal/LiteralQuoteUtils.java | 27 +++++++++++++++---- 3 files changed, 22 insertions(+), 37 deletions(-) diff --git a/pom.xml b/pom.xml index 20a3d6ee0..50acbf777 100644 --- a/pom.xml +++ b/pom.xml @@ -280,24 +280,6 @@ jackson-databind - - - - com.github.ben-manes.caffeine - caffeine - 2.9.3 
- - - com.google.errorprone - error_prone_annotations - - - org.checkerframework - checker-qual - - - - com.google.code.findbugs jsr305 diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java b/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java index 3fac37429..239fad56d 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java @@ -97,7 +97,6 @@ private static JsonNode validateAndParseSemiStructuredAsJsonTree( String columnName, Object input, String snowflakeType) { if (input instanceof String) { String stringInput = (String) input; - verifyValidUtf8(stringInput, columnName, snowflakeType); try { return objectMapper.readTree(stringInput); } catch (JsonProcessingException e) { @@ -461,7 +460,6 @@ static String validateAndParseString( String output; if (input instanceof String) { output = (String) input; - verifyValidUtf8(output, columnName, "STRING"); } else if (input instanceof Number) { output = new BigDecimal(input.toString()).stripTrailingZeros().toPlainString(); } else if (input instanceof Boolean || input instanceof Character) { @@ -836,16 +834,4 @@ private static String sanitizeValueForExceptionMessage(Object value) { String valueString = value.toString(); return valueString.length() <= maxSize ? valueString : valueString.substring(0, 20) + "..."; } - - /** - * Validates that a string is valid UTF-8 string. It catches situations like unmatched high/low - * UTF-16 surrogate, for example. - */ - private static void verifyValidUtf8(String input, String columnName, String dataType) { - String roundTripStr = - new String(input.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8); - if (!input.equals(roundTripStr)) { - throw valueFormatNotAllowedException(columnName, input, dataType, "Invalid Unicode string"); - } - } } diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java b/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java index 12fe5cd3a..74d97cfa0 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java @@ -3,8 +3,12 @@ */ package net.snowflake.ingest.streaming.internal; -import com.github.benmanes.caffeine.cache.Caffeine; -import com.github.benmanes.caffeine.cache.LoadingCache; +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; +import java.util.concurrent.ExecutionException; +import net.snowflake.ingest.utils.ErrorCode; +import net.snowflake.ingest.utils.SFException; /** * Util class to normalise literals to match server side metadata. @@ -22,9 +26,15 @@ class LiteralQuoteUtils { static { unquotedColumnNamesCache = - Caffeine.newBuilder() + CacheBuilder.newBuilder() .maximumSize(UNQUOTED_COLUMN_NAME_CACHE_MAX_SIZE) - .build(LiteralQuoteUtils::unquoteColumnNameInternal); + .build( + new CacheLoader() { + @Override + public String load(String key) { + return unquoteColumnNameInternal(key); + } + }); } /** @@ -32,7 +42,14 @@ class LiteralQuoteUtils { * expensive. If not, it unquotes directly, otherwise it return a value from a loading cache. 
*/ static String unquoteColumnName(String columnName) { - return unquotedColumnNamesCache.get(columnName); + try { + return unquotedColumnNamesCache.get(columnName); + } catch (ExecutionException e) { + throw new SFException( + e, + ErrorCode.INTERNAL_ERROR, + String.format("Exception thrown while unquoting column name %s", columnName)); + } } /** From 1190275dabc78d23f0f489819b9186b475fa3548 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Fri, 7 Apr 2023 13:50:14 -0700 Subject: [PATCH 09/29] Revert "remove utf8" This reverts commit fdedb236d185c428cb209352e7017e644f1e3631. --- pom.xml | 18 +++++++++++++ .../internal/DataValidationUtil.java | 14 ++++++++++ .../streaming/internal/LiteralQuoteUtils.java | 27 ++++--------------- 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/pom.xml b/pom.xml index 50acbf777..20a3d6ee0 100644 --- a/pom.xml +++ b/pom.xml @@ -280,6 +280,24 @@ jackson-databind + + + + com.github.ben-manes.caffeine + caffeine + 2.9.3 + + + com.google.errorprone + error_prone_annotations + + + org.checkerframework + checker-qual + + + + com.google.code.findbugs jsr305 diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java b/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java index 239fad56d..3fac37429 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java @@ -97,6 +97,7 @@ private static JsonNode validateAndParseSemiStructuredAsJsonTree( String columnName, Object input, String snowflakeType) { if (input instanceof String) { String stringInput = (String) input; + verifyValidUtf8(stringInput, columnName, snowflakeType); try { return objectMapper.readTree(stringInput); } catch (JsonProcessingException e) { @@ -460,6 +461,7 @@ static String validateAndParseString( String output; if (input instanceof String) { output = (String) input; + verifyValidUtf8(output, columnName, "STRING"); } else if (input instanceof Number) { output = new BigDecimal(input.toString()).stripTrailingZeros().toPlainString(); } else if (input instanceof Boolean || input instanceof Character) { @@ -834,4 +836,16 @@ private static String sanitizeValueForExceptionMessage(Object value) { String valueString = value.toString(); return valueString.length() <= maxSize ? valueString : valueString.substring(0, 20) + "..."; } + + /** + * Validates that a string is valid UTF-8 string. It catches situations like unmatched high/low + * UTF-16 surrogate, for example. 
+ */ + private static void verifyValidUtf8(String input, String columnName, String dataType) { + String roundTripStr = + new String(input.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8); + if (!input.equals(roundTripStr)) { + throw valueFormatNotAllowedException(columnName, input, dataType, "Invalid Unicode string"); + } + } } diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java b/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java index 74d97cfa0..12fe5cd3a 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java @@ -3,12 +3,8 @@ */ package net.snowflake.ingest.streaming.internal; -import com.google.common.cache.CacheBuilder; -import com.google.common.cache.CacheLoader; -import com.google.common.cache.LoadingCache; -import java.util.concurrent.ExecutionException; -import net.snowflake.ingest.utils.ErrorCode; -import net.snowflake.ingest.utils.SFException; +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.LoadingCache; /** * Util class to normalise literals to match server side metadata. @@ -26,15 +22,9 @@ class LiteralQuoteUtils { static { unquotedColumnNamesCache = - CacheBuilder.newBuilder() + Caffeine.newBuilder() .maximumSize(UNQUOTED_COLUMN_NAME_CACHE_MAX_SIZE) - .build( - new CacheLoader() { - @Override - public String load(String key) { - return unquoteColumnNameInternal(key); - } - }); + .build(LiteralQuoteUtils::unquoteColumnNameInternal); } /** @@ -42,14 +32,7 @@ public String load(String key) { * expensive. If not, it unquotes directly, otherwise it return a value from a loading cache. */ static String unquoteColumnName(String columnName) { - try { - return unquotedColumnNamesCache.get(columnName); - } catch (ExecutionException e) { - throw new SFException( - e, - ErrorCode.INTERNAL_ERROR, - String.format("Exception thrown while unquoting column name %s", columnName)); - } + return unquotedColumnNamesCache.get(columnName); } /** From 96db6395d8fed621da2972348283dd813c8d29ed Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Fri, 7 Apr 2023 14:01:26 -0700 Subject: [PATCH 10/29] fix --- pom.xml | 18 ------------- .../streaming/internal/LiteralQuoteUtils.java | 27 +++++++++++++++---- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/pom.xml b/pom.xml index 20a3d6ee0..50acbf777 100644 --- a/pom.xml +++ b/pom.xml @@ -280,24 +280,6 @@ jackson-databind - - - - com.github.ben-manes.caffeine - caffeine - 2.9.3 - - - com.google.errorprone - error_prone_annotations - - - org.checkerframework - checker-qual - - - - com.google.code.findbugs jsr305 diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java b/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java index 12fe5cd3a..74d97cfa0 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/LiteralQuoteUtils.java @@ -3,8 +3,12 @@ */ package net.snowflake.ingest.streaming.internal; -import com.github.benmanes.caffeine.cache.Caffeine; -import com.github.benmanes.caffeine.cache.LoadingCache; +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; +import java.util.concurrent.ExecutionException; +import net.snowflake.ingest.utils.ErrorCode; +import 
net.snowflake.ingest.utils.SFException; /** * Util class to normalise literals to match server side metadata. @@ -22,9 +26,15 @@ class LiteralQuoteUtils { static { unquotedColumnNamesCache = - Caffeine.newBuilder() + CacheBuilder.newBuilder() .maximumSize(UNQUOTED_COLUMN_NAME_CACHE_MAX_SIZE) - .build(LiteralQuoteUtils::unquoteColumnNameInternal); + .build( + new CacheLoader() { + @Override + public String load(String key) { + return unquoteColumnNameInternal(key); + } + }); } /** @@ -32,7 +42,14 @@ class LiteralQuoteUtils { * expensive. If not, it unquotes directly, otherwise it return a value from a loading cache. */ static String unquoteColumnName(String columnName) { - return unquotedColumnNamesCache.get(columnName); + try { + return unquotedColumnNamesCache.get(columnName); + } catch (ExecutionException e) { + throw new SFException( + e, + ErrorCode.INTERNAL_ERROR, + String.format("Exception thrown while unquoting column name %s", columnName)); + } } /** From e4ef9220111cd21b67ceef1536a43b8c24021cb8 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Fri, 7 Apr 2023 17:01:41 -0700 Subject: [PATCH 11/29] fix tests --- .../ingest/streaming/internal/datatypes/StringsIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/StringsIT.java b/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/StringsIT.java index f41e24dc1..0163ae9eb 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/StringsIT.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/StringsIT.java @@ -217,7 +217,7 @@ public void testCollatedColumnsNotSupported() throws SQLException { openChannel(tableName); Assert.fail("Opening a channel shouldn't have succeeded"); } catch (SFException e) { - Assert.assertEquals(ErrorCode.UNSUPPORTED_DATA_TYPE.getMessageCode(), e.getVendorCode()); + Assert.assertEquals(ErrorCode.INVALID_ROW.getMessageCode(), e.getVendorCode()); } } } From 2160a360a81e120bacf04c725dd068b57168b4b4 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Fri, 7 Apr 2023 18:26:54 -0700 Subject: [PATCH 12/29] fix tests --- .../ingest/streaming/internal/datatypes/StringsIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/StringsIT.java b/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/StringsIT.java index 0163ae9eb..cfa59e0b1 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/StringsIT.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/datatypes/StringsIT.java @@ -217,7 +217,7 @@ public void testCollatedColumnsNotSupported() throws SQLException { openChannel(tableName); Assert.fail("Opening a channel shouldn't have succeeded"); } catch (SFException e) { - Assert.assertEquals(ErrorCode.INVALID_ROW.getMessageCode(), e.getVendorCode()); + Assert.assertEquals(ErrorCode.OPEN_CHANNEL_FAILURE.getMessageCode(), e.getVendorCode()); } } } From 4146bfbec62c240737f18a61e78fd94ba9369697 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Tue, 11 Apr 2023 15:42:03 -0700 Subject: [PATCH 13/29] use dynamic scaling thread pool --- .../ingest/streaming/internal/FlushService.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java index 7758e30e6..150d020b6 100644 --- 
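/*
 * Illustrative sketch (not part of the patch): the Guava LoadingCache pattern this series
 * settles on for column-name unquoting, shown standalone. The cache size and the normalize()
 * helper are placeholders, not SDK code; the Caffeine variant removed here differs mainly in
 * the builder API and in get() not throwing a checked ExecutionException.
 */
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import java.util.concurrent.ExecutionException;

class ColumnNameCacheSketch {
  private static final LoadingCache<String, String> CACHE =
      CacheBuilder.newBuilder()
          .maximumSize(1000) // bound memory; entries are evicted once the cap is exceeded
          .build(
              new CacheLoader<String, String>() {
                @Override
                public String load(String key) {
                  return normalize(key); // computed once per distinct key, then served from cache
                }
              });

  static String lookup(String columnName) {
    try {
      return CACHE.get(columnName);
    } catch (ExecutionException e) {
      // Guava wraps loader failures in a checked exception; rethrow as unchecked here.
      throw new IllegalStateException("Failed to normalize " + columnName, e);
    }
  }

  // Placeholder for the real unquoting logic.
  private static String normalize(String name) {
    return name.startsWith("\"") && name.endsWith("\"")
        ? name.substring(1, name.length() - 1)
        : name.toUpperCase();
  }
}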
a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java @@ -31,6 +31,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; @@ -319,7 +320,13 @@ private void createWorkers() { * (1 + this.owningClient.getParameterProvider().getIOTimeCpuRatio()), MAX_THREAD_COUNT); this.buildUploadWorkers = - Executors.newFixedThreadPool(buildUploadThreadCount, buildUploadThreadFactory); + new ThreadPoolExecutor( + 1, + buildUploadThreadCount, + 60L, + TimeUnit.SECONDS, + new SynchronousQueue(), + buildUploadThreadFactory); logger.logInfo( "Create {} threads for build/upload blobs for client={}, total available processors={}", From 0344b70d891c4a4befd959d5f9631420480e3a03 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Tue, 11 Apr 2023 16:05:21 -0700 Subject: [PATCH 14/29] file format --- .../ingest/streaming/internal/DataValidationUtilTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java index f0f071c38..6f3dd4558 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java @@ -990,8 +990,8 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type BOOLEAN, Row Index: 0," - + " reason: Not a valid boolean, see" + + " cannot be ingested into Snowflake column COL of type BOOLEAN, Row Index: 0, reason:" + + " Not a valid boolean, see" + " https://docs.snowflake.com/en/sql-reference/data-types-logical.html#conversion-to-boolean" + " for the list of supported formats", () -> validateAndParseBoolean("COL", "abc", 0)); From 65e16f4cc45818ccbadab1a7e9c65b1799c56b0b Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Tue, 16 May 2023 21:27:10 -0700 Subject: [PATCH 15/29] fix --- .../ingest/streaming/internal/FlushService.java | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java index 150d020b6..7758e30e6 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java @@ -31,7 +31,6 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; @@ -320,13 +319,7 @@ private void createWorkers() { * (1 + this.owningClient.getParameterProvider().getIOTimeCpuRatio()), MAX_THREAD_COUNT); this.buildUploadWorkers = - new ThreadPoolExecutor( - 1, - buildUploadThreadCount, - 60L, - TimeUnit.SECONDS, - new SynchronousQueue(), - buildUploadThreadFactory); + 
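/*
 * Illustrative sketch (not part of the patch): the two build/upload worker-pool choices this
 * series switches between. Thread counts and the default factory are placeholders; the
 * behavioral comments describe standard java.util.concurrent semantics.
 */
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

class WorkerPoolSketch {
  static ExecutorService fixedPool(int threads) {
    // Creates up to 'threads' workers as tasks arrive and keeps them alive; additional
    // tasks wait in an unbounded queue.
    return Executors.newFixedThreadPool(threads);
  }

  static ExecutorService scalingPool(int maxThreads) {
    // Starts with one core thread and hands tasks directly to workers (no queueing).
    // The pool grows on demand up to maxThreads, idle extra threads die after 60 seconds,
    // and submissions are rejected once every worker is busy.
    return new ThreadPoolExecutor(
        1,
        maxThreads,
        60L,
        TimeUnit.SECONDS,
        new SynchronousQueue<>(),
        Executors.defaultThreadFactory());
  }
}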
Executors.newFixedThreadPool(buildUploadThreadCount, buildUploadThreadFactory); logger.logInfo( "Create {} threads for build/upload blobs for client={}, total available processors={}", From 025958c49ffb3407942da24c9ff3a7ec2e45c111 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Tue, 16 May 2023 21:57:58 -0700 Subject: [PATCH 16/29] update max chunk size --- .../streaming/internal/AbstractRowBuffer.java | 6 ++++-- .../streaming/internal/ParquetFlusher.java | 8 +++++-- .../streaming/internal/ParquetRowBuffer.java | 21 ++++++++++++------- ...owflakeStreamingIngestChannelInternal.java | 9 +++++--- .../net/snowflake/ingest/utils/Constants.java | 1 - .../ingest/utils/ParameterProvider.java | 13 +++++++++++- .../parquet/hadoop/BdecParquetWriter.java | 13 ++++++------ .../internal/ParameterProviderTest.java | 5 +++++ .../streaming/internal/RowBufferTest.java | 4 +++- 9 files changed, 56 insertions(+), 24 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java b/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java index 9156f9c4b..0580006d0 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java @@ -559,7 +559,8 @@ static AbstractRowBuffer createRowBuffer( String fullyQualifiedChannelName, Consumer rowSizeMetric, ChannelRuntimeState channelRuntimeState, - boolean enableParquetMemoryOptimization) { + boolean enableParquetMemoryOptimization, + long maxChunkSizeInBytes) { switch (bdecVersion) { case ONE: //noinspection unchecked @@ -581,7 +582,8 @@ static AbstractRowBuffer createRowBuffer( fullyQualifiedChannelName, rowSizeMetric, channelRuntimeState, - enableParquetMemoryOptimization); + enableParquetMemoryOptimization, + maxChunkSizeInBytes); default: throw new SFException( ErrorCode.INTERNAL_ERROR, "Unsupported BDEC format version: " + bdecVersion); diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java index 0ac28326d..b0c14b12f 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java @@ -25,14 +25,17 @@ public class ParquetFlusher implements Flusher { private static final Logging logger = new Logging(ParquetFlusher.class); private final MessageType schema; private final boolean enableParquetInternalBuffering; + private final long maxChunkSizeInBytes; /** * Construct parquet flusher from its schema and set flag that indicates whether Parquet memory * optimization is enabled, i.e. rows will be buffered in internal Parquet buffer. 
*/ - public ParquetFlusher(MessageType schema, boolean enableParquetInternalBuffering) { + public ParquetFlusher( + MessageType schema, boolean enableParquetInternalBuffering, long maxChunkSizeInBytes) { this.schema = schema; this.enableParquetInternalBuffering = enableParquetInternalBuffering; + this.maxChunkSizeInBytes = maxChunkSizeInBytes; } @Override @@ -194,7 +197,8 @@ private SerializationResult serializeFromJavaObjects( Map metadata = channelsDataPerTable.get(0).getVectors().metadata; parquetWriter = - new BdecParquetWriter(mergedData, schema, metadata, firstChannelFullyQualifiedTableName); + new BdecParquetWriter( + mergedData, schema, metadata, firstChannelFullyQualifiedTableName, maxChunkSizeInBytes); rows.forEach(parquetWriter::writeRow); parquetWriter.close(); diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java index a45a52b93..73774d080 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java @@ -52,6 +52,8 @@ public class ParquetRowBuffer extends AbstractRowBuffer { private MessageType schema; private final boolean enableParquetInternalBuffering; + private final long maxChunkSizeInBytes; + /** Construct a ParquetRowBuffer object. */ ParquetRowBuffer( OpenChannelRequest.OnErrorOption onErrorOption, @@ -60,7 +62,8 @@ public class ParquetRowBuffer extends AbstractRowBuffer { String fullyQualifiedChannelName, Consumer rowSizeMetric, ChannelRuntimeState channelRuntimeState, - boolean enableParquetInternalBuffering) { + boolean enableParquetInternalBuffering, + long maxChunkSizeInBytes) { super( onErrorOption, defaultTimezone, @@ -68,12 +71,13 @@ public class ParquetRowBuffer extends AbstractRowBuffer { fullyQualifiedChannelName, rowSizeMetric, channelRuntimeState); - fieldIndex = new HashMap<>(); - metadata = new HashMap<>(); - data = new ArrayList<>(); - tempData = new ArrayList<>(); - channelName = fullyQualifiedChannelName; + this.fieldIndex = new HashMap<>(); + this.metadata = new HashMap<>(); + this.data = new ArrayList<>(); + this.tempData = new ArrayList<>(); + this.channelName = fullyQualifiedChannelName; this.enableParquetInternalBuffering = enableParquetInternalBuffering; + this.maxChunkSizeInBytes = maxChunkSizeInBytes; } @Override @@ -117,7 +121,8 @@ private void createFileWriter() { fileOutput = new ByteArrayOutputStream(); try { if (enableParquetInternalBuffering) { - bdecParquetWriter = new BdecParquetWriter(fileOutput, schema, metadata, channelName); + bdecParquetWriter = + new BdecParquetWriter(fileOutput, schema, metadata, channelName, maxChunkSizeInBytes); } else { this.bdecParquetWriter = null; } @@ -305,7 +310,7 @@ void closeInternal() { @Override public Flusher createFlusher() { - return new ParquetFlusher(schema, enableParquetInternalBuffering); + return new ParquetFlusher(schema, enableParquetInternalBuffering, maxChunkSizeInBytes); } private static class ParquetColumn { diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelInternal.java b/src/main/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelInternal.java index 3916bc0d9..f4c4c6952 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelInternal.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelInternal.java @@ -5,7 +5,6 
@@ package net.snowflake.ingest.streaming.internal; import static net.snowflake.ingest.utils.Constants.INSERT_THROTTLE_MAX_RETRY_COUNT; -import static net.snowflake.ingest.utils.Constants.MAX_CHUNK_SIZE_IN_BYTES; import static net.snowflake.ingest.utils.Constants.RESPONSE_SUCCESS; import static net.snowflake.ingest.utils.ParameterProvider.MAX_MEMORY_LIMIT_IN_BYTES_DEFAULT; @@ -131,7 +130,10 @@ class SnowflakeStreamingIngestChannelInternal implements SnowflakeStreamingIn channelState, owningClient != null ? owningClient.getParameterProvider().getEnableParquetInternalBuffering() - : ParameterProvider.ENABLE_PARQUET_INTERNAL_BUFFERING_DEFAULT); + : ParameterProvider.ENABLE_PARQUET_INTERNAL_BUFFERING_DEFAULT, + owningClient != null + ? owningClient.getParameterProvider().getMaxChunkSizeInBytes() + : ParameterProvider.MAX_CHUNK_SIZE_IN_BYTES_DEFAULT); logger.logInfo( "Channel={} created for table={}", this.channelFlushContext.getName(), @@ -365,7 +367,8 @@ public InsertValidationResponse insertRows( // Start flush task if the chunk size reaches a certain size // TODO: Checking table/chunk level size reduces throughput a lot, we may want to check it only // if a large number of rows are inserted - if (this.rowBuffer.getSize() >= MAX_CHUNK_SIZE_IN_BYTES) { + if (this.rowBuffer.getSize() + >= this.owningClient.getParameterProvider().getMaxChunkSizeInBytes()) { this.owningClient.setNeedFlush(); } diff --git a/src/main/java/net/snowflake/ingest/utils/Constants.java b/src/main/java/net/snowflake/ingest/utils/Constants.java index e4e399978..050f14264 100644 --- a/src/main/java/net/snowflake/ingest/utils/Constants.java +++ b/src/main/java/net/snowflake/ingest/utils/Constants.java @@ -34,7 +34,6 @@ public class Constants { public static final int BLOB_UPLOAD_TIMEOUT_IN_SEC = 5; public static final int INSERT_THROTTLE_MAX_RETRY_COUNT = 60; public static final long MAX_BLOB_SIZE_IN_BYTES = 256000000L; - public static final long MAX_CHUNK_SIZE_IN_BYTES = 16000000L; public static final int BLOB_TAG_SIZE_IN_BYTES = 4; public static final int BLOB_VERSION_SIZE_IN_BYTES = 1; public static final int BLOB_FILE_SIZE_SIZE_IN_BYTES = 8; diff --git a/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java b/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java index 11f9fbeac..e2f38f476 100644 --- a/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java +++ b/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java @@ -16,7 +16,6 @@ public class ParameterProvider { "STREAMING_INGEST_CLIENT_SDK_INSERT_THROTTLE_THRESHOLD_IN_PERCENTAGE".toLowerCase(); public static final String INSERT_THROTTLE_THRESHOLD_IN_BYTES = "STREAMING_INGEST_CLIENT_SDK_INSERT_THROTTLE_THRESHOLD_IN_BYTES".toLowerCase(); - public static final String ENABLE_SNOWPIPE_STREAMING_METRICS = "ENABLE_SNOWPIPE_STREAMING_JMX_METRICS".toLowerCase(); public static final String BLOB_FORMAT_VERSION = "BLOB_FORMAT_VERSION".toLowerCase(); @@ -26,6 +25,7 @@ public class ParameterProvider { public static final String MAX_MEMORY_LIMIT_IN_BYTES = "MAX_MEMORY_LIMIT_IN_BYTES".toLowerCase(); public static final String ENABLE_PARQUET_INTERNAL_BUFFERING = "ENABLE_PARQUET_INTERNAL_BUFFERING".toLowerCase(); + public static final String MAX_CHUNK_SIZE_IN_BYTES = "MAX_CHUNK_SIZE_IN_BYTES".toLowerCase(); // Default values public static final long BUFFER_FLUSH_INTERVAL_IN_MILLIS_DEFAULT = 1000; @@ -39,6 +39,7 @@ public class ParameterProvider { public static final int IO_TIME_CPU_RATIO_DEFAULT = 2; public static final int 
BLOB_UPLOAD_MAX_RETRY_COUNT_DEFAULT = 24; public static final long MAX_MEMORY_LIMIT_IN_BYTES_DEFAULT = -1L; + public static final long MAX_CHUNK_SIZE_IN_BYTES_DEFAULT = 32000000L; /* Parameter that enables using internal Parquet buffers for buffering of rows before serializing. It reduces memory consumption compared to using Java Objects for buffering.*/ @@ -136,6 +137,9 @@ private void setParameterMap(Map parameterOverrides, Properties ENABLE_PARQUET_INTERNAL_BUFFERING_DEFAULT, parameterOverrides, props); + + this.updateValue( + MAX_CHUNK_SIZE_IN_BYTES, MAX_CHUNK_SIZE_IN_BYTES_DEFAULT, parameterOverrides, props); } /** @return Longest interval in milliseconds between buffer flushes */ @@ -261,6 +265,13 @@ public boolean getEnableParquetInternalBuffering() { return (val instanceof String) ? Boolean.parseBoolean(val.toString()) : (boolean) val; } + /** @return The max chunk size in bytes */ + public long getMaxChunkSizeInBytes() { + Object val = + this.parameterMap.getOrDefault(MAX_CHUNK_SIZE_IN_BYTES, MAX_CHUNK_SIZE_IN_BYTES_DEFAULT); + return (val instanceof String) ? Long.parseLong(val.toString()) : (long) val; + } + @Override public String toString() { return "ParameterProvider{" + "parameterMap=" + parameterMap + '}'; diff --git a/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java b/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java index 9122e20be..55ab1bf30 100644 --- a/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java +++ b/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java @@ -4,8 +4,6 @@ package org.apache.parquet.hadoop; -import static net.snowflake.ingest.utils.Constants.MAX_CHUNK_SIZE_IN_BYTES; - import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.List; @@ -52,9 +50,10 @@ public BdecParquetWriter( ByteArrayOutputStream stream, MessageType schema, Map extraMetaData, - String channelName) + String channelName, + long maxChunkSizeInBytes) throws IOException { - OutputFile file = new ByteArrayOutputFile(stream); + OutputFile file = new ByteArrayOutputFile(stream, maxChunkSizeInBytes); ParquetProperties encodingProps = createParquetProperties(); Configuration conf = new Configuration(); WriteSupport> writeSupport = @@ -166,9 +165,11 @@ private static ParquetProperties createParquetProperties() { */ private static class ByteArrayOutputFile implements OutputFile { private final ByteArrayOutputStream stream; + private final long maxChunkSizeInBytes; - private ByteArrayOutputFile(ByteArrayOutputStream stream) { + private ByteArrayOutputFile(ByteArrayOutputStream stream, long maxChunkSizeInBytes) { this.stream = stream; + this.maxChunkSizeInBytes = maxChunkSizeInBytes; } @Override @@ -189,7 +190,7 @@ public boolean supportsBlockSize() { @Override public long defaultBlockSize() { - return (int) MAX_CHUNK_SIZE_IN_BYTES; + return maxChunkSizeInBytes; } } diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/ParameterProviderTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/ParameterProviderTest.java index f7f8da84f..1fe034635 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/ParameterProviderTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/ParameterProviderTest.java @@ -21,6 +21,7 @@ public void withValuesSet() { parameterMap.put(ParameterProvider.IO_TIME_CPU_RATIO, 10); parameterMap.put(ParameterProvider.BLOB_UPLOAD_MAX_RETRY_COUNT, 100); parameterMap.put(ParameterProvider.MAX_MEMORY_LIMIT_IN_BYTES, 1000L); + 
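/*
 * Illustrative sketch (not part of the patch): the override-with-default lookup pattern that
 * the new MAX_CHUNK_SIZE_IN_BYTES parameter follows. The class and key handling below are
 * stand-ins; the real provider also merges values from a Properties object.
 */
import java.util.HashMap;
import java.util.Map;

class ParameterLookupSketch {
  private final Map<String, Object> parameterMap = new HashMap<>();

  ParameterLookupSketch(Map<String, Object> overrides) {
    if (overrides != null) {
      parameterMap.putAll(overrides);
    }
  }

  /** Overrides may arrive as typed values or as strings (for example from a properties file). */
  long getLong(String key, long defaultValue) {
    Object val = parameterMap.getOrDefault(key, defaultValue);
    return (val instanceof String) ? Long.parseLong((String) val) : (long) val;
  }
}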
parameterMap.put(ParameterProvider.MAX_CHUNK_SIZE_IN_BYTES, 1000000L); ParameterProvider parameterProvider = new ParameterProvider(parameterMap, prop); Assert.assertEquals(3L, parameterProvider.getBufferFlushIntervalInMs()); @@ -31,6 +32,7 @@ public void withValuesSet() { Assert.assertEquals(10, parameterProvider.getIOTimeCpuRatio()); Assert.assertEquals(100, parameterProvider.getBlobUploadMaxRetryCount()); Assert.assertEquals(1000L, parameterProvider.getMaxMemoryLimitInBytes()); + Assert.assertEquals(1000000L, parameterProvider.getMaxChunkSizeInBytes()); } @Test @@ -117,5 +119,8 @@ public void withDefaultValues() { Assert.assertEquals( ParameterProvider.MAX_MEMORY_LIMIT_IN_BYTES_DEFAULT, parameterProvider.getMaxMemoryLimitInBytes()); + Assert.assertEquals( + ParameterProvider.MAX_CHUNK_SIZE_IN_BYTES_DEFAULT, + parameterProvider.getMaxChunkSizeInBytes()); } } diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java index 55df1eb21..b6143e6ce 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java @@ -1,6 +1,7 @@ package net.snowflake.ingest.streaming.internal; import static java.time.ZoneOffset.UTC; +import static net.snowflake.ingest.utils.ParameterProvider.MAX_CHUNK_SIZE_IN_BYTES_DEFAULT; import java.math.BigDecimal; import java.math.BigInteger; @@ -131,7 +132,8 @@ private AbstractRowBuffer createTestBuffer(OpenChannelRequest.OnErrorOption o "test.buffer", rs -> {}, initialState, - enableParquetMemoryOptimization); + enableParquetMemoryOptimization, + MAX_CHUNK_SIZE_IN_BYTES_DEFAULT); } @Test From e43c354901b1385be6f73f1a832516c838f8d784 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Wed, 17 May 2023 23:04:44 -0700 Subject: [PATCH 17/29] fix schema issue --- .../streaming/internal/FlushService.java | 35 ++++--- .../streaming/internal/FlushServiceTest.java | 94 +++++++++++++++---- 2 files changed, 100 insertions(+), 29 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java index 7758e30e6..39e17381c 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java @@ -373,22 +373,16 @@ void distributeFlushTasks() { int idx = 0; while (idx < channelsDataPerTable.size()) { ChannelData channelData = channelsDataPerTable.get(idx); - // Stop processing the rest of channels if reaching the blob size limit or the channel - // has different encryption key ids + // Stop processing the rest of channels when needed if (idx > 0 - && (totalBufferSizeInBytes + channelData.getBufferSize() > MAX_BLOB_SIZE_IN_BYTES - || !Objects.equals( - channelData.getChannelContext().getEncryptionKeyId(), - channelsDataPerTable - .get(idx - 1) - .getChannelContext() - .getEncryptionKeyId()))) { + && shouldStopProcessing( + totalBufferSizeInBytes, channelData, channelsDataPerTable.get(idx - 1))) { leftoverChannelsDataPerTable.addAll( channelsDataPerTable.subList(idx, channelsDataPerTable.size())); logger.logInfo( "Creation of another blob is needed because of blob size limit or different" - + " encryption ids, client={}, table={}, size={}, encryptionId1={}," - + " encryptionId2={}", + + " encryption ids or different schema, client={}, table={}, size={}," + + " encryptionId1={}, encryptionId2={}", 
this.owningClient.getName(), channelData.getChannelContext().getTableName(), totalBufferSizeInBytes + channelData.getBufferSize(), @@ -471,6 +465,25 @@ void distributeFlushTasks() { this.registerService.addBlobs(blobs); } + /** + * Check whether we should stop merging more channels into the chunks, we need to stop in a few + * cases + * + *
<li>
When the size is larger than a certain threshold + * + *
<li>
When the encryption key ids are not the same + * + *
<li>
When the schema is not the same + */ + private boolean shouldStopProcessing( + float totalBufferSizeInBytes, ChannelData current, ChannelData prev) { + return totalBufferSizeInBytes + current.getBufferSize() > MAX_BLOB_SIZE_IN_BYTES + || !Objects.equals( + current.getChannelContext().getEncryptionKeyId(), + prev.getChannelContext().getEncryptionKeyId()) + || !current.getColumnEps().keySet().equals(prev.getColumnEps().keySet()); + } + /** * Builds and uploads file to cloud storage. * diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java index 0c485b94b..c75b16748 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java @@ -396,9 +396,9 @@ private SnowflakeStreamingIngestChannelInternal addChannel4(TestContext te .buildAndAdd(); } - private static ColumnMetadata createTestIntegerColumn() { + private static ColumnMetadata createTestIntegerColumn(String name) { ColumnMetadata colInt = new ColumnMetadata(); - colInt.setName("COLINT"); + colInt.setName(name); colInt.setPhysicalType("SB4"); colInt.setNullable(true); colInt.setLogicalType("FIXED"); @@ -407,9 +407,9 @@ private static ColumnMetadata createTestIntegerColumn() { return colInt; } - private static ColumnMetadata createTestTextColumn() { + private static ColumnMetadata createTestTextColumn(String name) { ColumnMetadata colChar = new ColumnMetadata(); - colChar.setName("COLCHAR"); + colChar.setName(name); colChar.setPhysicalType("LOB"); colChar.setNullable(true); colChar.setLogicalType("TEXT"); @@ -486,19 +486,22 @@ public void testBlobCreation() throws Exception { SnowflakeStreamingIngestChannelInternal channel1 = addChannel1(testContext); SnowflakeStreamingIngestChannelInternal channel2 = addChannel2(testContext); SnowflakeStreamingIngestChannelInternal channel4 = addChannel4(testContext); + String colName1 = "testBlobCreation1"; + String colName2 = "testBlobCreation2"; - List schema = Arrays.asList(createTestIntegerColumn(), createTestTextColumn()); + List schema = + Arrays.asList(createTestIntegerColumn(colName1), createTestTextColumn(colName2)); channel1.getRowBuffer().setupSchema(schema); channel2.getRowBuffer().setupSchema(schema); channel4.getRowBuffer().setupSchema(schema); List> rows1 = RowSetBuilder.newBuilder() - .addColumn("COLINT", 11) - .addColumn("COLCHAR", "bob") + .addColumn(colName1, 11) + .addColumn(colName2, "bob") .newRow() - .addColumn("COLINT", 22) - .addColumn("COLCHAR", "bob") + .addColumn(colName1, 22) + .addColumn(colName2, "bob") .build(); channel1.insertRows(rows1, "offset1"); @@ -512,26 +515,78 @@ public void testBlobCreation() throws Exception { Mockito.verify(flushService, Mockito.atLeast(2)).buildAndUpload(Mockito.any(), Mockito.any()); } + @Test + public void testBlobSplitDueToDifferentSchema() throws Exception { + TestContext testContext = testContextFactory.create(); + SnowflakeStreamingIngestChannelInternal channel1 = addChannel1(testContext); + SnowflakeStreamingIngestChannelInternal channel2 = addChannel2(testContext); + String colName1 = "testBlobSplitDueToDifferentSchema1"; + String colName2 = "testBlobSplitDueToDifferentSchema2"; + String colName3 = "testBlobSplitDueToDifferentSchema3"; + + List schema1 = + Arrays.asList(createTestIntegerColumn(colName1), createTestTextColumn(colName2)); + List schema2 = + Arrays.asList( + createTestIntegerColumn(colName1), + 
createTestTextColumn(colName2), + createTestIntegerColumn(colName3)); + channel1.getRowBuffer().setupSchema(schema1); + channel2.getRowBuffer().setupSchema(schema2); + + List> rows1 = + RowSetBuilder.newBuilder() + .addColumn(colName1, 11) + .addColumn(colName2, "bob") + .newRow() + .addColumn(colName1, 22) + .addColumn(colName2, "bob") + .build(); + + List> rows2 = + RowSetBuilder.newBuilder() + .addColumn(colName1, 11) + .addColumn(colName2, "bob") + .addColumn(colName3, 11) + .newRow() + .addColumn(colName1, 22) + .addColumn(colName2, "bob") + .addColumn(colName3, 22) + .build(); + + channel1.insertRows(rows1, "offset1"); + channel2.insertRows(rows2, "offset2"); + + FlushService flushService = testContext.flushService; + + // Force = true flushes + flushService.flush(true).get(); + Mockito.verify(flushService, Mockito.atLeast(2)).buildAndUpload(Mockito.any(), Mockito.any()); + } + @Test public void testBuildAndUpload() throws Exception { TestContext testContext = testContextFactory.create(); SnowflakeStreamingIngestChannelInternal channel1 = addChannel1(testContext); SnowflakeStreamingIngestChannelInternal channel2 = addChannel2(testContext); + String colName1 = "testBuildAndUpload1"; + String colName2 = "testBuildAndUpload2"; - List schema = Arrays.asList(createTestIntegerColumn(), createTestTextColumn()); + List schema = + Arrays.asList(createTestIntegerColumn(colName1), createTestTextColumn(colName2)); channel1.getRowBuffer().setupSchema(schema); channel2.getRowBuffer().setupSchema(schema); List> rows1 = RowSetBuilder.newBuilder() - .addColumn("COLINT", 11) - .addColumn("COLCHAR", "bob") + .addColumn(colName1, 11) + .addColumn(colName2, "bob") .newRow() - .addColumn("COLINT", 22) - .addColumn("COLCHAR", "bob") + .addColumn(colName1, 22) + .addColumn(colName2, "bob") .build(); List> rows2 = - RowSetBuilder.newBuilder().addColumn("COLINT", null).addColumn("COLCHAR", "toby").build(); + RowSetBuilder.newBuilder().addColumn(colName1, null).addColumn(colName2, "toby").build(); channel1.insertRows(rows1, "offset1"); channel2.insertRows(rows2, "offset2"); @@ -647,15 +702,18 @@ public void testBuildErrors() throws Exception { TestContext testContext = testContextFactory.create(); SnowflakeStreamingIngestChannelInternal channel1 = addChannel1(testContext); SnowflakeStreamingIngestChannelInternal channel3 = addChannel3(testContext); + String colName1 = "testBuildErrors1"; + String colName2 = "testBuildErrors2"; - List schema = Arrays.asList(createTestIntegerColumn(), createTestTextColumn()); + List schema = + Arrays.asList(createTestIntegerColumn(colName1), createTestTextColumn(colName2)); channel1.getRowBuffer().setupSchema(schema); channel3.getRowBuffer().setupSchema(schema); List> rows1 = - RowSetBuilder.newBuilder().addColumn("COLINT", 0).addColumn("COLCHAR", "alice").build(); + RowSetBuilder.newBuilder().addColumn(colName1, 0).addColumn(colName2, "alice").build(); List> rows2 = - RowSetBuilder.newBuilder().addColumn("COLINT", 0).addColumn("COLCHAR", 111).build(); + RowSetBuilder.newBuilder().addColumn(colName1, 0).addColumn(colName2, 111).build(); channel1.insertRows(rows1, "offset1"); channel3.insertRows(rows2, "offset2"); From e016947afa38d2abab11a1106234db8fe6dfc99a Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Wed, 17 May 2023 23:10:51 -0700 Subject: [PATCH 18/29] fix --- .../ingest/streaming/internal/FlushService.java | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java 
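/*
 * Illustrative sketch (not part of the patch): a toy version of the grouping rule introduced
 * here — channel data is packed greedily into a blob until the size cap is reached or the next
 * channel has a different encryption key id or column set, at which point a new blob starts.
 * ChannelStub and the size constant are invented for the example; only the split conditions
 * mirror shouldStopProcessing.
 */
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Set;

class BlobSplitSketch {
  static final long MAX_BLOB_SIZE = 256_000_000L;

  static class ChannelStub {
    final long bufferSize;
    final Long encryptionKeyId;
    final Set<String> columnNames;

    ChannelStub(long bufferSize, Long encryptionKeyId, Set<String> columnNames) {
      this.bufferSize = bufferSize;
      this.encryptionKeyId = encryptionKeyId;
      this.columnNames = columnNames;
    }
  }

  /** Greedily packs channels in order; starts a new group whenever the split rule fires. */
  static List<List<ChannelStub>> group(List<ChannelStub> channels) {
    List<List<ChannelStub>> groups = new ArrayList<>();
    List<ChannelStub> current = new ArrayList<>();
    long size = 0;
    for (ChannelStub c : channels) {
      boolean split =
          !current.isEmpty()
              && (size + c.bufferSize > MAX_BLOB_SIZE
                  || !Objects.equals(c.encryptionKeyId, last(current).encryptionKeyId)
                  || !c.columnNames.equals(last(current).columnNames));
      if (split) {
        groups.add(current);
        current = new ArrayList<>();
        size = 0;
      }
      current.add(c);
      size += c.bufferSize;
    }
    if (!current.isEmpty()) {
      groups.add(current);
    }
    return groups;
  }

  private static ChannelStub last(List<ChannelStub> list) {
    return list.get(list.size() - 1);
  }
}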
b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java index 39e17381c..45a65b75e 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java @@ -382,12 +382,14 @@ && shouldStopProcessing( logger.logInfo( "Creation of another blob is needed because of blob size limit or different" + " encryption ids or different schema, client={}, table={}, size={}," - + " encryptionId1={}, encryptionId2={}", + + " encryptionId1={}, encryptionId2={}, schema1={}, schema2={}", this.owningClient.getName(), channelData.getChannelContext().getTableName(), totalBufferSizeInBytes + channelData.getBufferSize(), channelData.getChannelContext().getEncryptionKeyId(), - channelsDataPerTable.get(idx - 1).getChannelContext().getEncryptionKeyId()); + channelsDataPerTable.get(idx - 1).getChannelContext().getEncryptionKeyId(), + channelData.getColumnEps().keySet(), + channelsDataPerTable.get(idx - 1).getColumnEps().keySet()); break; } totalBufferSizeInBytes += channelData.getBufferSize(); @@ -466,14 +468,14 @@ && shouldStopProcessing( } /** - * Check whether we should stop merging more channels into the chunks, we need to stop in a few - * cases + * Check whether we should stop merging more channels into the same chunk, we need to stop in a + * few cases * *
<li>
When the size is larger than a certain threshold * *
<li>
When the encryption key ids are not the same * - *
<li>
When the schema is not the same + *
<li>
When the schemas are not the same */ private boolean shouldStopProcessing( float totalBufferSizeInBytes, ChannelData current, ChannelData prev) { From 12f9eb288037f435907728ef4548b31091891d4c Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Thu, 18 May 2023 11:48:58 -0700 Subject: [PATCH 19/29] address comment --- .../net/snowflake/ingest/streaming/internal/FlushService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java index 45a65b75e..591c1db85 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java @@ -469,7 +469,7 @@ && shouldStopProcessing( /** * Check whether we should stop merging more channels into the same chunk, we need to stop in a - * few cases + * few cases: * *
<li>
When the size is larger than a certain threshold * From ab0620f1729c0ec6771b669bc9aaa7668d623b00 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Sat, 3 Jun 2023 23:10:12 -0700 Subject: [PATCH 20/29] save progress --- .../streaming/internal/FlushService.java | 29 ++++++++--- .../streaming/internal/FlushServiceTest.java | 51 +++++++++++++++++++ 2 files changed, 73 insertions(+), 7 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java index ae4d04a23..74d53f569 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java @@ -370,21 +370,28 @@ void distributeFlushTasks() { if (!channelsDataPerTable.isEmpty()) { int idx = 0; + float totalBufferSizePerTableInBytes = 0F; while (idx < channelsDataPerTable.size()) { ChannelData channelData = channelsDataPerTable.get(idx); // Stop processing the rest of channels when needed if (idx > 0 && shouldStopProcessing( - totalBufferSizeInBytes, channelData, channelsDataPerTable.get(idx - 1))) { + totalBufferSizeInBytes, + totalBufferSizePerTableInBytes, + channelData, + channelsDataPerTable.get(idx - 1))) { leftoverChannelsDataPerTable.addAll( channelsDataPerTable.subList(idx, channelsDataPerTable.size())); logger.logInfo( - "Creation of another blob is needed because of blob size limit or different" - + " encryption ids or different schema, client={}, table={}, size={}," - + " encryptionId1={}, encryptionId2={}, schema1={}, schema2={}", + "Creation of another blob is needed because of blob/chunk size limit or" + + " different encryption ids or different schema, client={}, table={}," + + " fileSize={}, chunkSize={}, nextChannelSize={}, encryptionId1={}," + + " encryptionId2={}, schema1={}, schema2={}", this.owningClient.getName(), channelData.getChannelContext().getTableName(), - totalBufferSizeInBytes + channelData.getBufferSize(), + totalBufferSizeInBytes, + totalBufferSizePerTableInBytes, + channelData.getBufferSize(), channelData.getChannelContext().getEncryptionKeyId(), channelsDataPerTable.get(idx - 1).getChannelContext().getEncryptionKeyId(), channelData.getColumnEps().keySet(), @@ -392,6 +399,7 @@ && shouldStopProcessing( break; } totalBufferSizeInBytes += channelData.getBufferSize(); + totalBufferSizePerTableInBytes += channelData.getBufferSize(); idx++; } // Add processed channels to the current blob, stop if we need to create a new blob @@ -473,15 +481,22 @@ && shouldStopProcessing( * Check whether we should stop merging more channels into the same chunk, we need to stop in a * few cases: * - *
<li>
When the size is larger than a certain threshold + *
<li>
When the file size is larger than a certain threshold + * + *
<li>
When the chunk size is larger than a certain threshold * *
<li>
When the encryption key ids are not the same * *
<li>
When the schemas are not the same */ private boolean shouldStopProcessing( - float totalBufferSizeInBytes, ChannelData current, ChannelData prev) { + float totalBufferSizeInBytes, + float totalBufferSizePerTableInBytes, + ChannelData current, + ChannelData prev) { return totalBufferSizeInBytes + current.getBufferSize() > MAX_BLOB_SIZE_IN_BYTES + || totalBufferSizePerTableInBytes + current.getBufferSize() + > this.owningClient.getParameterProvider().getMaxChunkSizeInBytes() || !Objects.equals( current.getChannelContext().getEncryptionKeyId(), prev.getChannelContext().getEncryptionKeyId()) diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java index fc049c297..39632d0a7 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java @@ -360,6 +360,18 @@ private static ColumnMetadata createTestTextColumn(String name) { return colChar; } + private static ColumnMetadata createLargeTestTextColumn(String name) { + ColumnMetadata colChar = new ColumnMetadata(); + colChar.setName(name); + colChar.setPhysicalType("LOB"); + colChar.setNullable(true); + colChar.setLogicalType("TEXT"); + colChar.setByteLength(14000000); + colChar.setLength(11000000); + colChar.setScale(0); + return colChar; + } + @Test public void testGetFilePath() { TestContext testContext = testContextFactory.create(); @@ -505,6 +517,45 @@ public void testBlobSplitDueToDifferentSchema() throws Exception { Mockito.verify(flushService, Mockito.atLeast(2)).buildAndUpload(Mockito.any(), Mockito.any()); } + @Test + public void testBlobSplitDueToChunkSizeLimit() throws Exception { + TestContext testContext = testContextFactory.create(); + SnowflakeStreamingIngestChannelInternal channel1 = addChannel1(testContext); + SnowflakeStreamingIngestChannelInternal channel2 = addChannel2(testContext); + String colName1 = "testBlobSplitDueToDifferentSchema1"; + String colName2 = "testBlobSplitDueToDifferentSchema2"; + String largeData = new String(new char[10000000]); + + List schema = + Arrays.asList(createTestIntegerColumn(colName1), createLargeTestTextColumn(colName2)); + channel1.getRowBuffer().setupSchema(schema); + channel2.getRowBuffer().setupSchema(schema); + + List> rows = + RowSetBuilder.newBuilder() + .addColumn(colName1, 11) + .addColumn(colName2, largeData) + .newRow() + .addColumn(colName1, 22) + .addColumn(colName2, largeData) + .newRow() + .addColumn(colName1, 33) + .addColumn(colName2, largeData) + .newRow() + .addColumn(colName1, 44) + .addColumn(colName2, largeData) + .build(); + + channel1.insertRows(rows, "offset1"); + channel2.insertRows(rows, "offset2"); + + FlushService flushService = testContext.flushService; + + // Force = true flushes + flushService.flush(true).get(); + Mockito.verify(flushService, Mockito.atLeast(2)).buildAndUpload(Mockito.any(), Mockito.any()); + } + @Test public void testBuildAndUpload() throws Exception { long expectedBuildLatencyMs = 100; From eaa729a6254b61d3d86c59caa37c6b7885ac46b3 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Mon, 5 Jun 2023 00:34:07 -0700 Subject: [PATCH 21/29] add tests --- .../streaming/internal/FlushService.java | 2 +- .../ingest/utils/ParameterProvider.java | 19 ++++++++++++ .../streaming/internal/FlushServiceTest.java | 30 ++++++++----------- 3 files changed, 33 insertions(+), 18 deletions(-) diff --git 
a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java index 74d53f569..6fe196680 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java @@ -496,7 +496,7 @@ private boolean shouldStopProcessing( ChannelData prev) { return totalBufferSizeInBytes + current.getBufferSize() > MAX_BLOB_SIZE_IN_BYTES || totalBufferSizePerTableInBytes + current.getBufferSize() - > this.owningClient.getParameterProvider().getMaxChunkSizeInBytes() + > this.owningClient.getParameterProvider().getMaxChunkSizeInBytesToAvoidOom() || !Objects.equals( current.getChannelContext().getEncryptionKeyId(), prev.getChannelContext().getEncryptionKeyId()) diff --git a/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java b/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java index 7faea2859..e07cfaf1b 100644 --- a/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java +++ b/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java @@ -25,7 +25,11 @@ public class ParameterProvider { public static final String MAX_MEMORY_LIMIT_IN_BYTES = "MAX_MEMORY_LIMIT_IN_BYTES".toLowerCase(); public static final String ENABLE_PARQUET_INTERNAL_BUFFERING = "ENABLE_PARQUET_INTERNAL_BUFFERING".toLowerCase(); + // This is actually channel size limit at this moment until we implement the size tracking logic + // at table/chunk level public static final String MAX_CHUNK_SIZE_IN_BYTES = "MAX_CHUNK_SIZE_IN_BYTES".toLowerCase(); + public static final String MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM = + "MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM".toLowerCase(); // Default values public static final long BUFFER_FLUSH_INTERVAL_IN_MILLIS_DEFAULT = 1000; @@ -40,6 +44,7 @@ public class ParameterProvider { public static final int BLOB_UPLOAD_MAX_RETRY_COUNT_DEFAULT = 24; public static final long MAX_MEMORY_LIMIT_IN_BYTES_DEFAULT = -1L; public static final long MAX_CHUNK_SIZE_IN_BYTES_DEFAULT = 32000000L; + public static final long MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM_DEFAULT = 128000000L; /* Parameter that enables using internal Parquet buffers for buffering of rows before serializing. It reduces memory consumption compared to using Java Objects for buffering.*/ @@ -140,6 +145,12 @@ private void setParameterMap(Map parameterOverrides, Properties this.updateValue( MAX_CHUNK_SIZE_IN_BYTES, MAX_CHUNK_SIZE_IN_BYTES_DEFAULT, parameterOverrides, props); + + this.updateValue( + MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM, + MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM_DEFAULT, + parameterOverrides, + props); } /** @return Longest interval in milliseconds between buffer flushes */ @@ -272,6 +283,14 @@ public long getMaxChunkSizeInBytes() { return (val instanceof String) ? Long.parseLong(val.toString()) : (long) val; } + /** @return The max chunk size in bytes that could avoid OOM at server side */ + public long getMaxChunkSizeInBytesToAvoidOom() { + Object val = + this.parameterMap.getOrDefault( + MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM, MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM_DEFAULT); + return (val instanceof String) ? 
Long.parseLong(val.toString()) : (long) val; + } + @Override public String toString() { return "ParameterProvider{" + "parameterMap=" + parameterMap + '}'; diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java index 39632d0a7..99669eb03 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java @@ -7,6 +7,7 @@ import static net.snowflake.ingest.utils.Constants.BLOB_NO_HEADER; import static net.snowflake.ingest.utils.Constants.BLOB_TAG_SIZE_IN_BYTES; import static net.snowflake.ingest.utils.Constants.BLOB_VERSION_SIZE_IN_BYTES; +import static net.snowflake.ingest.utils.ParameterProvider.MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM_DEFAULT; import com.codahale.metrics.Histogram; import com.codahale.metrics.Meter; @@ -522,29 +523,24 @@ public void testBlobSplitDueToChunkSizeLimit() throws Exception { TestContext testContext = testContextFactory.create(); SnowflakeStreamingIngestChannelInternal channel1 = addChannel1(testContext); SnowflakeStreamingIngestChannelInternal channel2 = addChannel2(testContext); - String colName1 = "testBlobSplitDueToDifferentSchema1"; - String colName2 = "testBlobSplitDueToDifferentSchema2"; - String largeData = new String(new char[10000000]); + String colName1 = "testBlobSplitDueToChunkSizeLimit1"; + String colName2 = "testBlobSplitDueToChunkSizeLimit2"; + int rowSize = 10000000; + String largeData = new String(new char[rowSize]); List schema = Arrays.asList(createTestIntegerColumn(colName1), createLargeTestTextColumn(colName2)); channel1.getRowBuffer().setupSchema(schema); channel2.getRowBuffer().setupSchema(schema); - List> rows = - RowSetBuilder.newBuilder() - .addColumn(colName1, 11) - .addColumn(colName2, largeData) - .newRow() - .addColumn(colName1, 22) - .addColumn(colName2, largeData) - .newRow() - .addColumn(colName1, 33) - .addColumn(colName2, largeData) - .newRow() - .addColumn(colName1, 44) - .addColumn(colName2, largeData) - .build(); + RowSetBuilder builder = RowSetBuilder.newBuilder(); + RowSetBuilder.newBuilder().addColumn(colName1, 11).addColumn(colName2, largeData); + + for (int idx = 0; idx <= MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM_DEFAULT / (2 * rowSize); idx++) { + builder.addColumn(colName1, 11).addColumn(colName2, largeData).newRow(); + } + + List> rows = builder.build(); channel1.insertRows(rows, "offset1"); channel2.insertRows(rows, "offset2"); From f17ba1bde2943476a26d5e996ffa9e6e0ec3ed56 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Mon, 5 Jun 2023 18:40:55 -0700 Subject: [PATCH 22/29] fix tests --- .../snowflake/ingest/streaming/internal/FlushService.java | 7 ++++++- .../ingest/streaming/internal/FlushServiceTest.java | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java index ff2c51e20..310fb1f9b 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java @@ -259,7 +259,7 @@ CompletableFuture flush(boolean isForce) { if (isForce || (!DISABLE_BACKGROUND_FLUSH - && !this.isTestMode + && !isTestMode() && (this.isNeedFlush || timeDiffMillis >= this.owningClient.getParameterProvider().getBufferFlushIntervalInMs()))) { @@ -673,4 +673,9 @@ boolean 
throttleDueToQueuedFlushTasks() { } return throttleOnQueuedTasks; } + + /** Get whether we're running under test mode */ + boolean isTestMode() { + return this.isTestMode; + } } diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java index 99669eb03..aa8a3c8b6 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java @@ -93,7 +93,7 @@ private abstract static class TestContext implements AutoCloseable { channelCache = new ChannelCache<>(); Mockito.when(client.getChannelCache()).thenReturn(channelCache); registerService = Mockito.spy(new RegisterService(client, client.isTestMode())); - flushService = Mockito.spy(new FlushService<>(client, channelCache, stage, false)); + flushService = Mockito.spy(new FlushService<>(client, channelCache, stage, true)); } ChannelData flushChannel(String name) { @@ -411,6 +411,7 @@ public void testGetFilePath() { public void testFlush() throws Exception { TestContext testContext = testContextFactory.create(); FlushService flushService = testContext.flushService; + Mockito.when(flushService.isTestMode()).thenReturn(false); // Nothing to flush flushService.flush(false).get(); @@ -549,7 +550,7 @@ public void testBlobSplitDueToChunkSizeLimit() throws Exception { // Force = true flushes flushService.flush(true).get(); - Mockito.verify(flushService, Mockito.atLeast(2)).buildAndUpload(Mockito.any(), Mockito.any()); + Mockito.verify(flushService, Mockito.times(2)).buildAndUpload(Mockito.any(), Mockito.any()); } @Test From e925d83a596e74dd8cc7450b5d1df29f73be3e1e Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Wed, 7 Jun 2023 14:29:29 -0700 Subject: [PATCH 23/29] address comments --- .../streaming/internal/FlushService.java | 2 +- ...owflakeStreamingIngestChannelInternal.java | 6 ++-- .../ingest/utils/ParameterProvider.java | 29 ++++++++----------- .../streaming/internal/FlushServiceTest.java | 4 +-- .../internal/ParameterProviderTest.java | 8 ++--- .../streaming/internal/RowBufferTest.java | 4 +-- 6 files changed, 24 insertions(+), 29 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java index 310fb1f9b..86358b24d 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java @@ -496,7 +496,7 @@ private boolean shouldStopProcessing( ChannelData prev) { return totalBufferSizeInBytes + current.getBufferSize() > MAX_BLOB_SIZE_IN_BYTES || totalBufferSizePerTableInBytes + current.getBufferSize() - > this.owningClient.getParameterProvider().getMaxChunkSizeInBytesToAvoidOom() + > this.owningClient.getParameterProvider().getMaxChunkSizeInBytes() || !Objects.equals( current.getChannelContext().getEncryptionKeyId(), prev.getChannelContext().getEncryptionKeyId()) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelInternal.java b/src/main/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelInternal.java index b2d523a58..f5dfb73a6 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelInternal.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelInternal.java @@ -126,8 +126,8 @@ class 
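/*
 * Illustrative sketch (not part of the patch): the test seam used above — reading the flag
 * through a package-private getter so a Mockito spy can override it per test. The Service
 * class is a stand-in for FlushService, not SDK code.
 */
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.when;

class TestModeSeamSketch {
  static class Service {
    private final boolean isTestMode;

    Service(boolean isTestMode) {
      this.isTestMode = isTestMode;
    }

    boolean isTestMode() {
      return isTestMode;
    }

    String flush() {
      // Going through the getter rather than the field is what makes the flag stubbable.
      return isTestMode() ? "skipped" : "flushed";
    }
  }

  static String forceRealFlushPath() {
    Service service = spy(new Service(true));
    when(service.isTestMode()).thenReturn(false); // override the flag for this one test
    return service.flush(); // "flushed"
  }
}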
SnowflakeStreamingIngestChannelInternal implements SnowflakeStreamingIn ? owningClient.getParameterProvider().getEnableParquetInternalBuffering() : ParameterProvider.ENABLE_PARQUET_INTERNAL_BUFFERING_DEFAULT, owningClient != null - ? owningClient.getParameterProvider().getMaxChunkSizeInBytes() - : ParameterProvider.MAX_CHUNK_SIZE_IN_BYTES_DEFAULT); + ? owningClient.getParameterProvider().getMaxChannelSizeInBytes() + : ParameterProvider.MAX_CHANNEL_SIZE_IN_BYTES_DEFAULT); logger.logInfo( "Channel={} created for table={}", this.channelFlushContext.getName(), @@ -362,7 +362,7 @@ public InsertValidationResponse insertRows( // TODO: Checking table/chunk level size reduces throughput a lot, we may want to check it only // if a large number of rows are inserted if (this.rowBuffer.getSize() - >= this.owningClient.getParameterProvider().getMaxChunkSizeInBytes()) { + >= this.owningClient.getParameterProvider().getMaxChannelSizeInBytes()) { this.owningClient.setNeedFlush(); } diff --git a/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java b/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java index e07cfaf1b..451682f52 100644 --- a/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java +++ b/src/main/java/net/snowflake/ingest/utils/ParameterProvider.java @@ -25,11 +25,9 @@ public class ParameterProvider { public static final String MAX_MEMORY_LIMIT_IN_BYTES = "MAX_MEMORY_LIMIT_IN_BYTES".toLowerCase(); public static final String ENABLE_PARQUET_INTERNAL_BUFFERING = "ENABLE_PARQUET_INTERNAL_BUFFERING".toLowerCase(); - // This is actually channel size limit at this moment until we implement the size tracking logic - // at table/chunk level + // This should not be needed once we have the ability to track size at table/chunk level + public static final String MAX_CHANNEL_SIZE_IN_BYTES = "MAX_CHANNEL_SIZE_IN_BYTES".toLowerCase(); public static final String MAX_CHUNK_SIZE_IN_BYTES = "MAX_CHUNK_SIZE_IN_BYTES".toLowerCase(); - public static final String MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM = - "MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM".toLowerCase(); // Default values public static final long BUFFER_FLUSH_INTERVAL_IN_MILLIS_DEFAULT = 1000; @@ -43,8 +41,8 @@ public class ParameterProvider { public static final int IO_TIME_CPU_RATIO_DEFAULT = 2; public static final int BLOB_UPLOAD_MAX_RETRY_COUNT_DEFAULT = 24; public static final long MAX_MEMORY_LIMIT_IN_BYTES_DEFAULT = -1L; - public static final long MAX_CHUNK_SIZE_IN_BYTES_DEFAULT = 32000000L; - public static final long MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM_DEFAULT = 128000000L; + public static final long MAX_CHANNEL_SIZE_IN_BYTES_DEFAULT = 32000000L; + public static final long MAX_CHUNK_SIZE_IN_BYTES_DEFAULT = 128000000L; /* Parameter that enables using internal Parquet buffers for buffering of rows before serializing. 
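/*
 * Illustrative usage sketch (not part of the patch), following the pattern of the tests in
 * this series. It assumes only the ParameterProvider constructor and getters that appear in
 * these patches; the concrete override values are arbitrary.
 */
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import net.snowflake.ingest.utils.ParameterProvider;

class ParameterOverrideSketch {
  static ParameterProvider withOverrides() {
    Map<String, Object> overrides = new HashMap<>();
    // Channel-level flush trigger (the renamed MAX_CHUNK_SIZE_IN_BYTES from earlier patches).
    overrides.put(ParameterProvider.MAX_CHANNEL_SIZE_IN_BYTES, 64000000L);
    // Chunk-level cap consulted when deciding whether to split a blob.
    overrides.put(ParameterProvider.MAX_CHUNK_SIZE_IN_BYTES, 128000000L);
    ParameterProvider provider = new ParameterProvider(overrides, new Properties());
    assert provider.getMaxChannelSizeInBytes() == 64000000L;
    return provider;
  }
}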
It reduces memory consumption compared to using Java Objects for buffering.*/ @@ -144,13 +142,10 @@ private void setParameterMap(Map parameterOverrides, Properties props); this.updateValue( - MAX_CHUNK_SIZE_IN_BYTES, MAX_CHUNK_SIZE_IN_BYTES_DEFAULT, parameterOverrides, props); + MAX_CHANNEL_SIZE_IN_BYTES, MAX_CHANNEL_SIZE_IN_BYTES_DEFAULT, parameterOverrides, props); this.updateValue( - MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM, - MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM_DEFAULT, - parameterOverrides, - props); + MAX_CHUNK_SIZE_IN_BYTES, MAX_CHUNK_SIZE_IN_BYTES_DEFAULT, parameterOverrides, props); } /** @return Longest interval in milliseconds between buffer flushes */ @@ -276,18 +271,18 @@ public boolean getEnableParquetInternalBuffering() { return (val instanceof String) ? Boolean.parseBoolean(val.toString()) : (boolean) val; } - /** @return The max chunk size in bytes */ - public long getMaxChunkSizeInBytes() { + /** @return The max channel size in bytes */ + public long getMaxChannelSizeInBytes() { Object val = - this.parameterMap.getOrDefault(MAX_CHUNK_SIZE_IN_BYTES, MAX_CHUNK_SIZE_IN_BYTES_DEFAULT); + this.parameterMap.getOrDefault( + MAX_CHANNEL_SIZE_IN_BYTES, MAX_CHANNEL_SIZE_IN_BYTES_DEFAULT); return (val instanceof String) ? Long.parseLong(val.toString()) : (long) val; } /** @return The max chunk size in bytes that could avoid OOM at server side */ - public long getMaxChunkSizeInBytesToAvoidOom() { + public long getMaxChunkSizeInBytes() { Object val = - this.parameterMap.getOrDefault( - MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM, MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM_DEFAULT); + this.parameterMap.getOrDefault(MAX_CHUNK_SIZE_IN_BYTES, MAX_CHUNK_SIZE_IN_BYTES_DEFAULT); return (val instanceof String) ? Long.parseLong(val.toString()) : (long) val; } diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java index aa8a3c8b6..38e6ac8ca 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/FlushServiceTest.java @@ -7,7 +7,7 @@ import static net.snowflake.ingest.utils.Constants.BLOB_NO_HEADER; import static net.snowflake.ingest.utils.Constants.BLOB_TAG_SIZE_IN_BYTES; import static net.snowflake.ingest.utils.Constants.BLOB_VERSION_SIZE_IN_BYTES; -import static net.snowflake.ingest.utils.ParameterProvider.MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM_DEFAULT; +import static net.snowflake.ingest.utils.ParameterProvider.MAX_CHUNK_SIZE_IN_BYTES_DEFAULT; import com.codahale.metrics.Histogram; import com.codahale.metrics.Meter; @@ -537,7 +537,7 @@ public void testBlobSplitDueToChunkSizeLimit() throws Exception { RowSetBuilder builder = RowSetBuilder.newBuilder(); RowSetBuilder.newBuilder().addColumn(colName1, 11).addColumn(colName2, largeData); - for (int idx = 0; idx <= MAX_CHUNK_SIZE_IN_BYTES_TO_AVOID_OOM_DEFAULT / (2 * rowSize); idx++) { + for (int idx = 0; idx <= MAX_CHUNK_SIZE_IN_BYTES_DEFAULT / (2 * rowSize); idx++) { builder.addColumn(colName1, 11).addColumn(colName2, largeData).newRow(); } diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/ParameterProviderTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/ParameterProviderTest.java index 1fe034635..dcf4037c6 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/ParameterProviderTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/ParameterProviderTest.java @@ -21,7 +21,7 @@ public void withValuesSet() 
{ parameterMap.put(ParameterProvider.IO_TIME_CPU_RATIO, 10); parameterMap.put(ParameterProvider.BLOB_UPLOAD_MAX_RETRY_COUNT, 100); parameterMap.put(ParameterProvider.MAX_MEMORY_LIMIT_IN_BYTES, 1000L); - parameterMap.put(ParameterProvider.MAX_CHUNK_SIZE_IN_BYTES, 1000000L); + parameterMap.put(ParameterProvider.MAX_CHANNEL_SIZE_IN_BYTES, 1000000L); ParameterProvider parameterProvider = new ParameterProvider(parameterMap, prop); Assert.assertEquals(3L, parameterProvider.getBufferFlushIntervalInMs()); @@ -32,7 +32,7 @@ public void withValuesSet() { Assert.assertEquals(10, parameterProvider.getIOTimeCpuRatio()); Assert.assertEquals(100, parameterProvider.getBlobUploadMaxRetryCount()); Assert.assertEquals(1000L, parameterProvider.getMaxMemoryLimitInBytes()); - Assert.assertEquals(1000000L, parameterProvider.getMaxChunkSizeInBytes()); + Assert.assertEquals(1000000L, parameterProvider.getMaxChannelSizeInBytes()); } @Test @@ -120,7 +120,7 @@ public void withDefaultValues() { ParameterProvider.MAX_MEMORY_LIMIT_IN_BYTES_DEFAULT, parameterProvider.getMaxMemoryLimitInBytes()); Assert.assertEquals( - ParameterProvider.MAX_CHUNK_SIZE_IN_BYTES_DEFAULT, - parameterProvider.getMaxChunkSizeInBytes()); + ParameterProvider.MAX_CHANNEL_SIZE_IN_BYTES_DEFAULT, + parameterProvider.getMaxChannelSizeInBytes()); } } diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java index b6e035c90..b60fbf9f9 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java @@ -1,7 +1,7 @@ package net.snowflake.ingest.streaming.internal; import static java.time.ZoneOffset.UTC; -import static net.snowflake.ingest.utils.ParameterProvider.MAX_CHUNK_SIZE_IN_BYTES_DEFAULT; +import static net.snowflake.ingest.utils.ParameterProvider.MAX_CHANNEL_SIZE_IN_BYTES_DEFAULT; import java.math.BigDecimal; import java.math.BigInteger; @@ -115,7 +115,7 @@ private AbstractRowBuffer createTestBuffer(OpenChannelRequest.OnErrorOption o rs -> {}, initialState, enableParquetMemoryOptimization, - MAX_CHUNK_SIZE_IN_BYTES_DEFAULT); + MAX_CHANNEL_SIZE_IN_BYTES_DEFAULT); } @Test From efec4e9e58dc27cf7fa97fddab0a5bbf7c6de3b8 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Thu, 8 Jun 2023 13:00:25 -0700 Subject: [PATCH 24/29] fix naming --- .../ingest/streaming/internal/AbstractRowBuffer.java | 4 ++-- .../ingest/streaming/internal/ParquetFlusher.java | 12 ++++++++---- .../ingest/streaming/internal/ParquetRowBuffer.java | 10 +++++----- .../org/apache/parquet/hadoop/BdecParquetWriter.java | 12 ++++++------ 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java b/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java index c5d0bddac..5fb24e64d 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java @@ -529,7 +529,7 @@ static AbstractRowBuffer createRowBuffer( Consumer rowSizeMetric, ChannelRuntimeState channelRuntimeState, boolean enableParquetMemoryOptimization, - long maxChunkSizeInBytes) { + long maxChannelSizeInBytes) { switch (bdecVersion) { case THREE: //noinspection unchecked @@ -541,7 +541,7 @@ static AbstractRowBuffer createRowBuffer( rowSizeMetric, channelRuntimeState, enableParquetMemoryOptimization, - maxChunkSizeInBytes); + 
maxChannelSizeInBytes); default: throw new SFException( ErrorCode.INTERNAL_ERROR, "Unsupported BDEC format version: " + bdecVersion); diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java index b0c14b12f..295fb9379 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java @@ -25,17 +25,17 @@ public class ParquetFlusher implements Flusher { private static final Logging logger = new Logging(ParquetFlusher.class); private final MessageType schema; private final boolean enableParquetInternalBuffering; - private final long maxChunkSizeInBytes; + private final long maxChannelSizeInBytes; /** * Construct parquet flusher from its schema and set flag that indicates whether Parquet memory * optimization is enabled, i.e. rows will be buffered in internal Parquet buffer. */ public ParquetFlusher( - MessageType schema, boolean enableParquetInternalBuffering, long maxChunkSizeInBytes) { + MessageType schema, boolean enableParquetInternalBuffering, long maxChannelSizeInBytes) { this.schema = schema; this.enableParquetInternalBuffering = enableParquetInternalBuffering; - this.maxChunkSizeInBytes = maxChunkSizeInBytes; + this.maxChannelSizeInBytes = maxChannelSizeInBytes; } @Override @@ -198,7 +198,11 @@ private SerializationResult serializeFromJavaObjects( Map metadata = channelsDataPerTable.get(0).getVectors().metadata; parquetWriter = new BdecParquetWriter( - mergedData, schema, metadata, firstChannelFullyQualifiedTableName, maxChunkSizeInBytes); + mergedData, + schema, + metadata, + firstChannelFullyQualifiedTableName, + maxChannelSizeInBytes); rows.forEach(parquetWriter::writeRow); parquetWriter.close(); diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java index 03b1c1762..e1a8ec08d 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java @@ -52,7 +52,7 @@ public class ParquetRowBuffer extends AbstractRowBuffer { private MessageType schema; private final boolean enableParquetInternalBuffering; - private final long maxChunkSizeInBytes; + private final long maxChannelSizeInBytes; /** Construct a ParquetRowBuffer object. 
*/ ParquetRowBuffer( @@ -62,7 +62,7 @@ public class ParquetRowBuffer extends AbstractRowBuffer { Consumer rowSizeMetric, ChannelRuntimeState channelRuntimeState, boolean enableParquetInternalBuffering, - long maxChunkSizeInBytes) { + long maxChannelSizeInBytes) { super( onErrorOption, defaultTimezone, @@ -75,7 +75,7 @@ public class ParquetRowBuffer extends AbstractRowBuffer { this.tempData = new ArrayList<>(); this.channelName = fullyQualifiedChannelName; this.enableParquetInternalBuffering = enableParquetInternalBuffering; - this.maxChunkSizeInBytes = maxChunkSizeInBytes; + this.maxChannelSizeInBytes = maxChannelSizeInBytes; } @Override @@ -120,7 +120,7 @@ private void createFileWriter() { try { if (enableParquetInternalBuffering) { bdecParquetWriter = - new BdecParquetWriter(fileOutput, schema, metadata, channelName, maxChunkSizeInBytes); + new BdecParquetWriter(fileOutput, schema, metadata, channelName, maxChannelSizeInBytes); } else { this.bdecParquetWriter = null; } @@ -308,7 +308,7 @@ void closeInternal() { @Override public Flusher createFlusher() { - return new ParquetFlusher(schema, enableParquetInternalBuffering, maxChunkSizeInBytes); + return new ParquetFlusher(schema, enableParquetInternalBuffering, maxChannelSizeInBytes); } private static class ParquetColumn { diff --git a/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java b/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java index b2442d3dc..a7a640e0f 100644 --- a/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java +++ b/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java @@ -51,9 +51,9 @@ public BdecParquetWriter( MessageType schema, Map extraMetaData, String channelName, - long maxChunkSizeInBytes) + long maxChannelSizeInBytes) throws IOException { - OutputFile file = new ByteArrayOutputFile(stream, maxChunkSizeInBytes); + OutputFile file = new ByteArrayOutputFile(stream, maxChannelSizeInBytes); ParquetProperties encodingProps = createParquetProperties(); Configuration conf = new Configuration(); WriteSupport> writeSupport = @@ -164,11 +164,11 @@ private static ParquetProperties createParquetProperties() { */ private static class ByteArrayOutputFile implements OutputFile { private final ByteArrayOutputStream stream; - private final long maxChunkSizeInBytes; + private final long maxChannelSizeInBytes; - private ByteArrayOutputFile(ByteArrayOutputStream stream, long maxChunkSizeInBytes) { + private ByteArrayOutputFile(ByteArrayOutputStream stream, long maxChannelSizeInBytes) { this.stream = stream; - this.maxChunkSizeInBytes = maxChunkSizeInBytes; + this.maxChannelSizeInBytes = maxChannelSizeInBytes; } @Override @@ -189,7 +189,7 @@ public boolean supportsBlockSize() { @Override public long defaultBlockSize() { - return maxChunkSizeInBytes; + return maxChannelSizeInBytes; } } From 65b75b148134d4a9f20bf438c2d1480741470801 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Wed, 12 Jul 2023 21:42:18 +0000 Subject: [PATCH 25/29] fix --- .../ingest/streaming/internal/ParquetFlusher.java | 12 ++++-------- .../org/apache/parquet/hadoop/BdecParquetWriter.java | 4 ++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java index 295fb9379..b0c14b12f 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetFlusher.java @@ -25,17 +25,17 @@ public 
class ParquetFlusher implements Flusher { private static final Logging logger = new Logging(ParquetFlusher.class); private final MessageType schema; private final boolean enableParquetInternalBuffering; - private final long maxChannelSizeInBytes; + private final long maxChunkSizeInBytes; /** * Construct parquet flusher from its schema and set flag that indicates whether Parquet memory * optimization is enabled, i.e. rows will be buffered in internal Parquet buffer. */ public ParquetFlusher( - MessageType schema, boolean enableParquetInternalBuffering, long maxChannelSizeInBytes) { + MessageType schema, boolean enableParquetInternalBuffering, long maxChunkSizeInBytes) { this.schema = schema; this.enableParquetInternalBuffering = enableParquetInternalBuffering; - this.maxChannelSizeInBytes = maxChannelSizeInBytes; + this.maxChunkSizeInBytes = maxChunkSizeInBytes; } @Override @@ -198,11 +198,7 @@ private SerializationResult serializeFromJavaObjects( Map metadata = channelsDataPerTable.get(0).getVectors().metadata; parquetWriter = new BdecParquetWriter( - mergedData, - schema, - metadata, - firstChannelFullyQualifiedTableName, - maxChannelSizeInBytes); + mergedData, schema, metadata, firstChannelFullyQualifiedTableName, maxChunkSizeInBytes); rows.forEach(parquetWriter::writeRow); parquetWriter.close(); diff --git a/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java b/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java index a7a640e0f..b21821520 100644 --- a/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java +++ b/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java @@ -51,9 +51,9 @@ public BdecParquetWriter( MessageType schema, Map extraMetaData, String channelName, - long maxChannelSizeInBytes) + long maxChunkSizeInBytes) throws IOException { - OutputFile file = new ByteArrayOutputFile(stream, maxChannelSizeInBytes); + OutputFile file = new ByteArrayOutputFile(stream, maxChunkSizeInBytes); ParquetProperties encodingProps = createParquetProperties(); Configuration conf = new Configuration(); WriteSupport> writeSupport = From e96688cda4c247936bbcb19e3e1b6e478622e5a7 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Wed, 12 Jul 2023 21:43:35 +0000 Subject: [PATCH 26/29] fix --- .../java/org/apache/parquet/hadoop/BdecParquetWriter.java | 8 ++++---- .../ingest/streaming/internal/DataValidationUtilTest.java | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java b/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java index b21821520..b2442d3dc 100644 --- a/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java +++ b/src/main/java/org/apache/parquet/hadoop/BdecParquetWriter.java @@ -164,11 +164,11 @@ private static ParquetProperties createParquetProperties() { */ private static class ByteArrayOutputFile implements OutputFile { private final ByteArrayOutputStream stream; - private final long maxChannelSizeInBytes; + private final long maxChunkSizeInBytes; - private ByteArrayOutputFile(ByteArrayOutputStream stream, long maxChannelSizeInBytes) { + private ByteArrayOutputFile(ByteArrayOutputStream stream, long maxChunkSizeInBytes) { this.stream = stream; - this.maxChannelSizeInBytes = maxChannelSizeInBytes; + this.maxChunkSizeInBytes = maxChunkSizeInBytes; } @Override @@ -189,7 +189,7 @@ public boolean supportsBlockSize() { @Override public long defaultBlockSize() { - return maxChannelSizeInBytes; + return maxChunkSizeInBytes; } } diff --git 
a/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java index f6f9e36af..6cd7a018e 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java @@ -1011,8 +1011,8 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type BOOLEAN, Row Index: 0, reason:" - + " Not a valid boolean, see" + + " cannot be ingested into Snowflake column COL of type BOOLEAN, Row Index: 0," + + " reason: Not a valid boolean, see" + " https://docs.snowflake.com/en/sql-reference/data-types-logical.html#conversion-to-boolean" + " for the list of supported formats", () -> validateAndParseBoolean("COL", "abc", 0)); From efa8e4ea26a907ea95ec6a5833f63013387ca5e7 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Wed, 20 Sep 2023 22:49:44 +0000 Subject: [PATCH 27/29] add row index --- .../streaming/internal/AbstractRowBuffer.java | 4 +-- .../internal/DataValidationUtil.java | 16 ++++++--- .../streaming/internal/ParquetRowBuffer.java | 4 +-- .../internal/DataValidationUtilTest.java | 36 +++++++++---------- .../streaming/internal/RowBufferTest.java | 2 +- .../SnowflakeStreamingIngestChannelTest.java | 4 +-- 6 files changed, 36 insertions(+), 30 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java b/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java index fa2c65006..a3ed90c54 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java @@ -260,7 +260,7 @@ Set verifyInputColumns( throw new SFException( ErrorCode.INVALID_FORMAT_ROW, "Extra columns: " + extraCols, - "Columns not present in the table shouldn't be specified."); + "Columns not present in the table shouldn't be specified. Row Index:%d"); } // Check for missing columns in the row @@ -278,7 +278,7 @@ Set verifyInputColumns( throw new SFException( ErrorCode.INVALID_FORMAT_ROW, "Missing columns: " + missingCols, - "Values for all non-nullable columns must be specified."); + "Values for all non-nullable columns must be specified. 
Row Index:%d"); } return inputColNamesMap.keySet(); diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java b/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java index 1bdfc2095..7982ca6a4 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java @@ -457,7 +457,10 @@ static TimestampWrapper validateAndParseTimestamp( if (offsetDateTime.getYear() < 1 || offsetDateTime.getYear() > 9999) { throw new SFException( ErrorCode.INVALID_VALUE_ROW, - "Timestamp out of representable inclusive range of years between 1 and 9999"); + String.format( + "Timestamp out of representable inclusive range of years between 1 and 9999, Row" + + " Index:%d", + insertRowIndex)); } return new TimestampWrapper(offsetDateTime, scale); } @@ -588,7 +591,10 @@ static int validateAndParseDate(String columnName, Object input, long insertRowI if (offsetDateTime.getYear() < -9999 || offsetDateTime.getYear() > 9999) { throw new SFException( ErrorCode.INVALID_VALUE_ROW, - "Date out of representable inclusive range of years between -9999 and 9999"); + String.format( + "Date out of representable inclusive range of years between -9999 and 9999, Row" + + " Index:%d", + insertRowIndex)); } return Math.toIntExact(offsetDateTime.toLocalDate().toEpochDay()); @@ -814,7 +820,7 @@ static void checkValueInRange( throw new SFException( ErrorCode.INVALID_FORMAT_ROW, String.format( - "Number out of representable exclusive range of (-1e%s..1e%s), Row Index:%s", + "Number out of representable exclusive range of (-1e%s..1e%s), Row Index:%d", precision - scale, precision - scale, insertRowIndex)); } } @@ -859,7 +865,7 @@ private static SFException typeNotAllowedException( ErrorCode.INVALID_FORMAT_ROW, String.format( "Object of type %s cannot be ingested into Snowflake column %s of type %s, Row" - + " Index:%s", + + " Index:%d", javaType.getName(), columnName, snowflakeType, insertRowIndex), String.format( String.format("Allowed Java types: %s", String.join(", ", allowedJavaTypes)))); @@ -882,7 +888,7 @@ private static SFException valueFormatNotAllowedException( return new SFException( ErrorCode.INVALID_VALUE_ROW, String.format( - "Value cannot be ingested into Snowflake column %s of type %s, Row Index: %s, reason:" + "Value cannot be ingested into Snowflake column %s of type %s, Row Index:%d, reason:" + " %s", columnName, snowflakeType, rowIndex, reason)); } diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java index 289b8a983..567dbf127 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java @@ -210,8 +210,8 @@ private float addRow( throw new SFException( ErrorCode.MAX_ROW_SIZE_EXCEEDED, String.format( - "rowSizeInBytes=%.3f maxAllowedRowSizeInBytes=%d", - size, clientBufferParameters.getMaxAllowedRowSizeInBytes())); + "rowSizeInBytes=%.3f, maxAllowedRowSizeInBytes=%d, Row Index=%d", + size, clientBufferParameters.getMaxAllowedRowSizeInBytes(), insertRowsCurrIndex)); } out.accept(Arrays.asList(indexedRow)); diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java index 6cd7a018e..0694020e7 100644 --- 
a/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java @@ -676,19 +676,19 @@ public void testTooLargeMultiByteSemiStructuredValues() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type VARIANT, Row Index: 0, reason:" + + " cannot be ingested into Snowflake column COL of type VARIANT, Row Index:0, reason:" + " Variant too long: length=18874376 maxLength=16777152", () -> validateAndParseVariant("COL", m, 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type ARRAY, Row Index: 0, reason:" + + " cannot be ingested into Snowflake column COL of type ARRAY, Row Index:0, reason:" + " Array too large. length=18874378 maxLength=16777152", () -> validateAndParseArray("COL", m, 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type OBJECT, Row Index: 0, reason:" + + " cannot be ingested into Snowflake column COL of type OBJECT, Row Index:0, reason:" + " Object too large. length=18874376 maxLength=16777152", () -> validateAndParseObject("COL", m, 0)); } @@ -1011,8 +1011,8 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type BOOLEAN, Row Index: 0," - + " reason: Not a valid boolean, see" + + " cannot be ingested into Snowflake column COL of type BOOLEAN, Row Index:0, reason:" + + " Not a valid boolean, see" + " https://docs.snowflake.com/en/sql-reference/data-types-logical.html#conversion-to-boolean" + " for the list of supported formats", () -> validateAndParseBoolean("COL", "abc", 0)); @@ -1027,7 +1027,7 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type TIME, Row Index: 0, reason:" + + " cannot be ingested into Snowflake column COL of type TIME, Row Index:0, reason:" + " Not a valid time, see" + " https://docs.snowflake.com/en/user-guide/data-load-snowpipe-streaming-overview for" + " the list of supported formats", @@ -1043,7 +1043,7 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type DATE, Row Index: 0, reason:" + + " cannot be ingested into Snowflake column COL of type DATE, Row Index:0, reason:" + " Not a valid value, see" + " https://docs.snowflake.com/en/user-guide/data-load-snowpipe-streaming-overview for" + " the list of supported formats", @@ -1060,7 +1060,7 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type TIMESTAMP, Row Index: 0," + + " cannot be ingested into Snowflake column 
COL of type TIMESTAMP, Row Index:0," + " reason: Not a valid value, see" + " https://docs.snowflake.com/en/user-guide/data-load-snowpipe-streaming-overview for" + " the list of supported formats", @@ -1077,7 +1077,7 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type TIMESTAMP, Row Index: 0," + + " cannot be ingested into Snowflake column COL of type TIMESTAMP, Row Index:0," + " reason: Not a valid value, see" + " https://docs.snowflake.com/en/user-guide/data-load-snowpipe-streaming-overview for" + " the list of supported formats", @@ -1094,7 +1094,7 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type TIMESTAMP, Row Index: 0," + + " cannot be ingested into Snowflake column COL of type TIMESTAMP, Row Index:0," + " reason: Not a valid value, see" + " https://docs.snowflake.com/en/user-guide/data-load-snowpipe-streaming-overview for" + " the list of supported formats", @@ -1110,7 +1110,7 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type NUMBER, Row Index: 0, reason:" + + " cannot be ingested into Snowflake column COL of type NUMBER, Row Index:0, reason:" + " Not a valid number", () -> validateAndParseBigDecimal("COL", "abc", 0)); @@ -1124,7 +1124,7 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type REAL, Row Index: 0, reason:" + + " cannot be ingested into Snowflake column COL of type REAL, Row Index:0, reason:" + " Not a valid decimal number", () -> validateAndParseReal("COL", "abc", 0)); @@ -1138,7 +1138,7 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type STRING, Row Index: 0, reason:" + + " cannot be ingested into Snowflake column COL of type STRING, Row Index:0, reason:" + " String too long: length=3 characters maxLength=2 characters", () -> validateAndParseString("COL", "abc", Optional.of(2), 0)); @@ -1152,13 +1152,13 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type BINARY, Row Index: 0, reason:" + + " cannot be ingested into Snowflake column COL of type BINARY, Row Index:0, reason:" + " Binary too long: length=2 maxLength=1", () -> validateAndParseBinary("COL", new byte[] {1, 2}, Optional.of(1), 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type BINARY, Row Index: 0, reason:" + + " cannot be ingested into Snowflake column COL of type BINARY, Row Index:0, reason:" + " Not a valid hex string", () -> 
validateAndParseBinary("COL", "ghi", Optional.empty(), 0)); @@ -1173,7 +1173,7 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type VARIANT, Row Index: 0, reason:" + + " cannot be ingested into Snowflake column COL of type VARIANT, Row Index:0, reason:" + " Not a valid JSON", () -> validateAndParseVariant("COL", "][", 0)); @@ -1188,7 +1188,7 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type ARRAY, Row Index: 0, reason:" + + " cannot be ingested into Snowflake column COL of type ARRAY, Row Index:0, reason:" + " Not a valid JSON", () -> validateAndParseArray("COL", "][", 0)); @@ -1203,7 +1203,7 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type OBJECT, Row Index: 0, reason:" + + " cannot be ingested into Snowflake column COL of type OBJECT, Row Index:0, reason:" + " Not a valid JSON", () -> validateAndParseObject("COL", "}{", 0)); } diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java index 63340e25a..45426283d 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java @@ -308,7 +308,7 @@ public void testRowIndexWithMultipleRowsWithError() { .equalsIgnoreCase( "The given row cannot be converted to the internal format due to invalid value:" + " Value cannot be ingested into Snowflake column COLCHAR of type STRING, Row" - + " Index: 1, reason: String too long: length=22 characters maxLength=11" + + " Index:1, reason: String too long: length=22 characters maxLength=11" + " characters")); } diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelTest.java index 62f2efdce..11be1d601 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelTest.java @@ -572,8 +572,8 @@ public void testInsertTooLargeRow() { .collect(Collectors.toList()); String expectedMessage = - "The given row exceeds the maximum allowed row size rowSizeInBytes=67109128.000" - + " maxAllowedRowSizeInBytes=67108864"; + "The given row exceeds the maximum allowed row size rowSizeInBytes=67109128.000," + + " maxAllowedRowSizeInBytes=67108864, Row Index=0"; Map row = new HashMap<>(); schema.forEach(x -> row.put(x.getName(), byteArrayOneMb)); From 22fe14828aa14c62f4a824fa7927db0847bfea40 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Wed, 27 Sep 2023 05:04:45 +0000 Subject: [PATCH 28/29] update message --- .../streaming/internal/AbstractRowBuffer.java | 15 +++-- .../internal/DataValidationUtil.java | 21 +++---- .../streaming/internal/ParquetRowBuffer.java | 2 +- .../internal/DataValidationUtilTest.java | 60 +++++++++---------- .../streaming/internal/RowBufferTest.java | 4 +- 
.../SnowflakeStreamingIngestChannelTest.java | 4 +- 6 files changed, 52 insertions(+), 54 deletions(-) diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java b/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java index a3ed90c54..dde639d00 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/AbstractRowBuffer.java @@ -236,10 +236,11 @@ public float getSize() { * * @param row the input row * @param error the insert error that we return to the customer + * @param rowIndex the index of the current row in the input batch * @return the set of input column names */ Set verifyInputColumns( - Map row, InsertValidationResponse.InsertError error) { + Map row, InsertValidationResponse.InsertError error, int rowIndex) { // Map of unquoted column name -> original column name Map inputColNamesMap = row.keySet().stream() @@ -260,7 +261,8 @@ Set verifyInputColumns( throw new SFException( ErrorCode.INVALID_FORMAT_ROW, "Extra columns: " + extraCols, - "Columns not present in the table shouldn't be specified. Row Index:%d"); + String.format( + "Columns not present in the table shouldn't be specified, rowIndex:%d", rowIndex)); } // Check for missing columns in the row @@ -278,7 +280,8 @@ Set verifyInputColumns( throw new SFException( ErrorCode.INVALID_FORMAT_ROW, "Missing columns: " + missingCols, - "Values for all non-nullable columns must be specified. Row Index:%d"); + String.format( + "Values for all non-nullable columns must be specified, rowIndex:%d", rowIndex)); } return inputColNamesMap.keySet(); @@ -304,12 +307,12 @@ public InsertValidationResponse insertRows( this.channelState.updateInsertStats(System.currentTimeMillis(), this.bufferedRowCount); if (onErrorOption == OpenChannelRequest.OnErrorOption.CONTINUE) { // Used to map incoming row(nth row) to InsertError(for nth row) in response - long rowIndex = 0; + int rowIndex = 0; for (Map row : rows) { InsertValidationResponse.InsertError error = new InsertValidationResponse.InsertError(row, rowIndex); try { - Set inputColumnNames = verifyInputColumns(row, error); + Set inputColumnNames = verifyInputColumns(row, error, rowIndex); rowsSizeInBytes += addRow(row, this.bufferedRowCount, this.statsMap, inputColumnNames, rowIndex); this.bufferedRowCount++; @@ -333,7 +336,7 @@ public InsertValidationResponse insertRows( float tempRowsSizeInBytes = 0F; int tempRowCount = 0; for (Map row : rows) { - Set inputColumnNames = verifyInputColumns(row, null); + Set inputColumnNames = verifyInputColumns(row, null, tempRowCount); tempRowsSizeInBytes += addTempRow(row, tempRowCount, this.tempStatsMap, inputColumnNames, tempRowCount); checkBatchSizeEnforcedMaximum(tempRowsSizeInBytes); diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java b/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java index 7982ca6a4..4514b98b7 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java @@ -24,11 +24,7 @@ import java.time.ZoneOffset; import java.time.ZonedDateTime; import java.time.format.DateTimeParseException; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; +import java.util.*; import java.util.function.Supplier; import 
net.snowflake.client.jdbc.internal.google.common.collect.Sets; import net.snowflake.client.jdbc.internal.snowflake.common.core.SnowflakeDateTimeFormat; @@ -458,8 +454,8 @@ static TimestampWrapper validateAndParseTimestamp( throw new SFException( ErrorCode.INVALID_VALUE_ROW, String.format( - "Timestamp out of representable inclusive range of years between 1 and 9999, Row" - + " Index:%d", + "Timestamp out of representable inclusive range of years between 1 and 9999," + + " rowIndex:%d", insertRowIndex)); } return new TimestampWrapper(offsetDateTime, scale); @@ -592,8 +588,8 @@ static int validateAndParseDate(String columnName, Object input, long insertRowI throw new SFException( ErrorCode.INVALID_VALUE_ROW, String.format( - "Date out of representable inclusive range of years between -9999 and 9999, Row" - + " Index:%d", + "Date out of representable inclusive range of years between -9999 and 9999," + + " rowIndex:%d", insertRowIndex)); } @@ -820,7 +816,7 @@ static void checkValueInRange( throw new SFException( ErrorCode.INVALID_FORMAT_ROW, String.format( - "Number out of representable exclusive range of (-1e%s..1e%s), Row Index:%d", + "Number out of representable exclusive range of (-1e%s..1e%s), rowIndex:%d", precision - scale, precision - scale, insertRowIndex)); } } @@ -864,8 +860,7 @@ private static SFException typeNotAllowedException( return new SFException( ErrorCode.INVALID_FORMAT_ROW, String.format( - "Object of type %s cannot be ingested into Snowflake column %s of type %s, Row" - + " Index:%d", + "Object of type %s cannot be ingested into Snowflake column %s of type %s, rowIndex:%d", javaType.getName(), columnName, snowflakeType, insertRowIndex), String.format( String.format("Allowed Java types: %s", String.join(", ", allowedJavaTypes)))); @@ -888,7 +883,7 @@ private static SFException valueFormatNotAllowedException( return new SFException( ErrorCode.INVALID_VALUE_ROW, String.format( - "Value cannot be ingested into Snowflake column %s of type %s, Row Index:%d, reason:" + "Value cannot be ingested into Snowflake column %s of type %s, rowIndex:%d, reason:" + " %s", columnName, snowflakeType, rowIndex, reason)); } diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java index 567dbf127..75966eb35 100644 --- a/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java +++ b/src/main/java/net/snowflake/ingest/streaming/internal/ParquetRowBuffer.java @@ -210,7 +210,7 @@ private float addRow( throw new SFException( ErrorCode.MAX_ROW_SIZE_EXCEEDED, String.format( - "rowSizeInBytes=%.3f, maxAllowedRowSizeInBytes=%d, Row Index=%d", + "rowSizeInBytes:%.3f, maxAllowedRowSizeInBytes:%d, rowIndex:%d", size, clientBufferParameters.getMaxAllowedRowSizeInBytes(), insertRowsCurrIndex)); } diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java index 0694020e7..86706fcf2 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/DataValidationUtilTest.java @@ -676,19 +676,19 @@ public void testTooLargeMultiByteSemiStructuredValues() { expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type VARIANT, Row Index:0, 
reason:" + + " cannot be ingested into Snowflake column COL of type VARIANT, rowIndex:0, reason:" + " Variant too long: length=18874376 maxLength=16777152", () -> validateAndParseVariant("COL", m, 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type ARRAY, Row Index:0, reason:" + + " cannot be ingested into Snowflake column COL of type ARRAY, rowIndex:0, reason:" + " Array too large. length=18874378 maxLength=16777152", () -> validateAndParseArray("COL", m, 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type OBJECT, Row Index:0, reason:" + + " cannot be ingested into Snowflake column COL of type OBJECT, rowIndex:0, reason:" + " Object too large. length=18874376 maxLength=16777152", () -> validateAndParseObject("COL", m, 0)); } @@ -1005,13 +1005,13 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_FORMAT_ROW, "The given row cannot be converted to the internal format: Object of type java.lang.Object" - + " cannot be ingested into Snowflake column COL of type BOOLEAN, Row Index:0. Allowed" + + " cannot be ingested into Snowflake column COL of type BOOLEAN, rowIndex:0. Allowed" + " Java types: boolean, Number, String", () -> validateAndParseBoolean("COL", new Object(), 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type BOOLEAN, Row Index:0, reason:" + + " cannot be ingested into Snowflake column COL of type BOOLEAN, rowIndex:0, reason:" + " Not a valid boolean, see" + " https://docs.snowflake.com/en/sql-reference/data-types-logical.html#conversion-to-boolean" + " for the list of supported formats", @@ -1021,13 +1021,13 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_FORMAT_ROW, "The given row cannot be converted to the internal format: Object of type java.lang.Object" - + " cannot be ingested into Snowflake column COL of type TIME, Row Index:0. Allowed" + + " cannot be ingested into Snowflake column COL of type TIME, rowIndex:0. Allowed" + " Java types: String, LocalTime, OffsetTime", () -> validateAndParseTime("COL", new Object(), 10, 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type TIME, Row Index:0, reason:" + + " cannot be ingested into Snowflake column COL of type TIME, rowIndex:0, reason:" + " Not a valid time, see" + " https://docs.snowflake.com/en/user-guide/data-load-snowpipe-streaming-overview for" + " the list of supported formats", @@ -1037,13 +1037,13 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_FORMAT_ROW, "The given row cannot be converted to the internal format: Object of type java.lang.Object" - + " cannot be ingested into Snowflake column COL of type DATE, Row Index:0. Allowed" + + " cannot be ingested into Snowflake column COL of type DATE, rowIndex:0. 
Allowed" + " Java types: String, LocalDate, LocalDateTime, ZonedDateTime, OffsetDateTime", () -> validateAndParseDate("COL", new Object(), 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type DATE, Row Index:0, reason:" + + " cannot be ingested into Snowflake column COL of type DATE, rowIndex:0, reason:" + " Not a valid value, see" + " https://docs.snowflake.com/en/user-guide/data-load-snowpipe-streaming-overview for" + " the list of supported formats", @@ -1053,14 +1053,14 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_FORMAT_ROW, "The given row cannot be converted to the internal format: Object of type java.lang.Object" - + " cannot be ingested into Snowflake column COL of type TIMESTAMP, Row Index:0." + + " cannot be ingested into Snowflake column COL of type TIMESTAMP, rowIndex:0." + " Allowed Java types: String, LocalDate, LocalDateTime, ZonedDateTime," + " OffsetDateTime", () -> validateAndParseTimestamp("COL", new Object(), 3, UTC, true, 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type TIMESTAMP, Row Index:0," + + " cannot be ingested into Snowflake column COL of type TIMESTAMP, rowIndex:0," + " reason: Not a valid value, see" + " https://docs.snowflake.com/en/user-guide/data-load-snowpipe-streaming-overview for" + " the list of supported formats", @@ -1070,14 +1070,14 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_FORMAT_ROW, "The given row cannot be converted to the internal format: Object of type java.lang.Object" - + " cannot be ingested into Snowflake column COL of type TIMESTAMP, Row Index:0." + + " cannot be ingested into Snowflake column COL of type TIMESTAMP, rowIndex:0." + " Allowed Java types: String, LocalDate, LocalDateTime, ZonedDateTime," + " OffsetDateTime", () -> validateAndParseTimestamp("COL", new Object(), 3, UTC, false, 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type TIMESTAMP, Row Index:0," + + " cannot be ingested into Snowflake column COL of type TIMESTAMP, rowIndex:0," + " reason: Not a valid value, see" + " https://docs.snowflake.com/en/user-guide/data-load-snowpipe-streaming-overview for" + " the list of supported formats", @@ -1087,14 +1087,14 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_FORMAT_ROW, "The given row cannot be converted to the internal format: Object of type java.lang.Object" - + " cannot be ingested into Snowflake column COL of type TIMESTAMP, Row Index:0." + + " cannot be ingested into Snowflake column COL of type TIMESTAMP, rowIndex:0." 
+ " Allowed Java types: String, LocalDate, LocalDateTime, ZonedDateTime," + " OffsetDateTime", () -> validateAndParseTimestamp("COL", new Object(), 3, UTC, false, 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type TIMESTAMP, Row Index:0," + + " cannot be ingested into Snowflake column COL of type TIMESTAMP, rowIndex:0," + " reason: Not a valid value, see" + " https://docs.snowflake.com/en/user-guide/data-load-snowpipe-streaming-overview for" + " the list of supported formats", @@ -1104,13 +1104,13 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_FORMAT_ROW, "The given row cannot be converted to the internal format: Object of type java.lang.Object" - + " cannot be ingested into Snowflake column COL of type NUMBER, Row Index:0. Allowed" + + " cannot be ingested into Snowflake column COL of type NUMBER, rowIndex:0. Allowed" + " Java types: int, long, byte, short, float, double, BigDecimal, BigInteger, String", () -> validateAndParseBigDecimal("COL", new Object(), 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type NUMBER, Row Index:0, reason:" + + " cannot be ingested into Snowflake column COL of type NUMBER, rowIndex:0, reason:" + " Not a valid number", () -> validateAndParseBigDecimal("COL", "abc", 0)); @@ -1118,13 +1118,13 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_FORMAT_ROW, "The given row cannot be converted to the internal format: Object of type java.lang.Object" - + " cannot be ingested into Snowflake column COL of type REAL, Row Index:0. Allowed" + + " cannot be ingested into Snowflake column COL of type REAL, rowIndex:0. Allowed" + " Java types: Number, String", () -> validateAndParseReal("COL", new Object(), 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type REAL, Row Index:0, reason:" + + " cannot be ingested into Snowflake column COL of type REAL, rowIndex:0, reason:" + " Not a valid decimal number", () -> validateAndParseReal("COL", "abc", 0)); @@ -1132,13 +1132,13 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_FORMAT_ROW, "The given row cannot be converted to the internal format: Object of type java.lang.Object" - + " cannot be ingested into Snowflake column COL of type STRING, Row Index:0. Allowed" + + " cannot be ingested into Snowflake column COL of type STRING, rowIndex:0. 
Allowed" + " Java types: String, Number, boolean, char", () -> validateAndParseString("COL", new Object(), Optional.empty(), 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type STRING, Row Index:0, reason:" + + " cannot be ingested into Snowflake column COL of type STRING, rowIndex:0, reason:" + " String too long: length=3 characters maxLength=2 characters", () -> validateAndParseString("COL", "abc", Optional.of(2), 0)); @@ -1146,19 +1146,19 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_FORMAT_ROW, "The given row cannot be converted to the internal format: Object of type java.lang.Object" - + " cannot be ingested into Snowflake column COL of type BINARY, Row Index:0. Allowed" + + " cannot be ingested into Snowflake column COL of type BINARY, rowIndex:0. Allowed" + " Java types: byte[], String", () -> validateAndParseBinary("COL", new Object(), Optional.empty(), 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type BINARY, Row Index:0, reason:" + + " cannot be ingested into Snowflake column COL of type BINARY, rowIndex:0, reason:" + " Binary too long: length=2 maxLength=1", () -> validateAndParseBinary("COL", new byte[] {1, 2}, Optional.of(1), 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type BINARY, Row Index:0, reason:" + + " cannot be ingested into Snowflake column COL of type BINARY, rowIndex:0, reason:" + " Not a valid hex string", () -> validateAndParseBinary("COL", "ghi", Optional.empty(), 0)); @@ -1166,14 +1166,14 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_FORMAT_ROW, "The given row cannot be converted to the internal format: Object of type java.lang.Object" - + " cannot be ingested into Snowflake column COL of type VARIANT, Row Index:0. Allowed" + + " cannot be ingested into Snowflake column COL of type VARIANT, rowIndex:0. Allowed" + " Java types: String, Primitive data types and their arrays, java.time.*, List," + " Map, T[]", () -> validateAndParseVariant("COL", new Object(), 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type VARIANT, Row Index:0, reason:" + + " cannot be ingested into Snowflake column COL of type VARIANT, rowIndex:0, reason:" + " Not a valid JSON", () -> validateAndParseVariant("COL", "][", 0)); @@ -1181,14 +1181,14 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_FORMAT_ROW, "The given row cannot be converted to the internal format: Object of type java.lang.Object" - + " cannot be ingested into Snowflake column COL of type ARRAY, Row Index:0. Allowed" + + " cannot be ingested into Snowflake column COL of type ARRAY, rowIndex:0. 
Allowed" + " Java types: String, Primitive data types and their arrays, java.time.*, List," + " Map, T[]", () -> validateAndParseArray("COL", new Object(), 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type ARRAY, Row Index:0, reason:" + + " cannot be ingested into Snowflake column COL of type ARRAY, rowIndex:0, reason:" + " Not a valid JSON", () -> validateAndParseArray("COL", "][", 0)); @@ -1196,14 +1196,14 @@ public void testExceptionMessages() { expectErrorCodeAndMessage( ErrorCode.INVALID_FORMAT_ROW, "The given row cannot be converted to the internal format: Object of type java.lang.Object" - + " cannot be ingested into Snowflake column COL of type OBJECT, Row Index:0. Allowed" + + " cannot be ingested into Snowflake column COL of type OBJECT, rowIndex:0. Allowed" + " Java types: String, Primitive data types and their arrays, java.time.*, List," + " Map, T[]", () -> validateAndParseObject("COL", new Object(), 0)); expectErrorCodeAndMessage( ErrorCode.INVALID_VALUE_ROW, "The given row cannot be converted to the internal format due to invalid value: Value" - + " cannot be ingested into Snowflake column COL of type OBJECT, Row Index:0, reason:" + + " cannot be ingested into Snowflake column COL of type OBJECT, rowIndex:0, reason:" + " Not a valid JSON", () -> validateAndParseObject("COL", "}{", 0)); } diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java index 45426283d..8d71d9a44 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/RowBufferTest.java @@ -307,8 +307,8 @@ public void testRowIndexWithMultipleRowsWithError() { .getMessage() .equalsIgnoreCase( "The given row cannot be converted to the internal format due to invalid value:" - + " Value cannot be ingested into Snowflake column COLCHAR of type STRING, Row" - + " Index:1, reason: String too long: length=22 characters maxLength=11" + + " Value cannot be ingested into Snowflake column COLCHAR of type STRING," + + " rowIndex:1, reason: String too long: length=22 characters maxLength=11" + " characters")); } diff --git a/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelTest.java b/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelTest.java index ad09c0a69..8fbf67264 100644 --- a/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelTest.java +++ b/src/test/java/net/snowflake/ingest/streaming/internal/SnowflakeStreamingIngestChannelTest.java @@ -588,8 +588,8 @@ public void testInsertTooLargeRow() { .collect(Collectors.toList()); String expectedMessage = - "The given row exceeds the maximum allowed row size rowSizeInBytes=67109128.000," - + " maxAllowedRowSizeInBytes=67108864, Row Index=0"; + "The given row exceeds the maximum allowed row size rowSizeInBytes:67109128.000," + + " maxAllowedRowSizeInBytes:67108864, rowIndex:0"; Map row = new HashMap<>(); schema.forEach(x -> row.put(x.getName(), byteArrayOneMb)); From f0dd91dbcbcfc0bbd6ea64412149a4bcfedabc36 Mon Sep 17 00:00:00 2001 From: Toby Zhang Date: Wed, 27 Sep 2023 05:55:18 +0000 Subject: [PATCH 29/29] remove star import --- .../ingest/streaming/internal/DataValidationUtil.java | 6 +++++- 1 file changed, 5 insertions(+), 1 
deletion(-)

diff --git a/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java b/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java
index 4514b98b7..a1831f829 100644
--- a/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java
+++ b/src/main/java/net/snowflake/ingest/streaming/internal/DataValidationUtil.java
@@ -24,7 +24,11 @@
 import java.time.ZoneOffset;
 import java.time.ZonedDateTime;
 import java.time.format.DateTimeParseException;
-import java.util.*;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
 import java.util.function.Supplier;
 import net.snowflake.client.jdbc.internal.google.common.collect.Sets;
 import net.snowflake.client.jdbc.internal.snowflake.common.core.SnowflakeDateTimeFormat;
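
Note: the sketch below is not part of the patch series. It is a minimal illustration of the two size parameters that PATCH 23 splits apart (MAX_CHANNEL_SIZE_IN_BYTES for the per-channel buffer, MAX_CHUNK_SIZE_IN_BYTES for the per-table chunk inside a blob), mirroring the ParameterProviderTest changes above. The class name ParameterOverrideSketch and the 64000000L override are illustrative assumptions only; ParameterProvider, its (Map, Properties) constructor, the parameter constants, and the two getters all appear in the diffs in this series.

    import java.util.HashMap;
    import java.util.Map;
    import java.util.Properties;
    import net.snowflake.ingest.utils.ParameterProvider;

    public class ParameterOverrideSketch {
      public static void main(String[] args) {
        // Override the per-channel buffer limit (formerly named MAX_CHUNK_SIZE_IN_BYTES,
        // renamed to MAX_CHANNEL_SIZE_IN_BYTES in this series) and set the server-side
        // chunk limit to its 128 MB default. 64000000L is an arbitrary example value,
        // not a recommended setting.
        Map<String, Object> parameterMap = new HashMap<>();
        parameterMap.put(ParameterProvider.MAX_CHANNEL_SIZE_IN_BYTES, 64000000L);
        parameterMap.put(ParameterProvider.MAX_CHUNK_SIZE_IN_BYTES, 128000000L);

        // Same construction pattern as ParameterProviderTest: overrides map plus Properties.
        ParameterProvider parameterProvider =
            new ParameterProvider(parameterMap, new Properties());

        // getMaxChannelSizeInBytes() is what the channel compares its row buffer size
        // against before requesting a flush; getMaxChunkSizeInBytes() is what
        // FlushService.shouldStopProcessing() uses to cap the per-table chunk in a blob.
        System.out.println(parameterProvider.getMaxChannelSizeInBytes()); // 64000000
        System.out.println(parameterProvider.getMaxChunkSizeInBytes());   // 128000000
      }
    }

Keeping the two limits separate matters because they gate different things: once a channel's row buffer crosses getMaxChannelSizeInBytes(), insertRows in SnowflakeStreamingIngestChannelInternal marks the client as needing a flush, while shouldStopProcessing in FlushService uses getMaxChunkSizeInBytes() to cap how much of one table's data is packed into a single chunk and so avoid server-side OOM.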