
Commit

merge master
sfc-gh-tzhang committed Jul 13, 2024
2 parents 8bddd2c + f5669d1 commit f2025e8
Showing 16 changed files with 523 additions and 60 deletions.
54 changes: 45 additions & 9 deletions pom.xml
@@ -358,6 +358,18 @@
<version>3.7.7</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-core</artifactId>
<version>1.34</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-generator-annprocess</artifactId>
<version>1.34</version>
<scope>test</scope>
</dependency>
</dependencies>
</dependencyManagement>

@@ -470,6 +482,12 @@
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-common</artifactId>
<exclusions>
<exclusion>
<groupId>javax.annotation</groupId>
<artifactId>javax.annotation-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
@@ -527,6 +545,16 @@
<artifactId>mockito-core</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-core</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-generator-annprocess</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.powermock</groupId>
<artifactId>powermock-api-mockito2</artifactId>
@@ -723,8 +751,10 @@
<ignoreNonCompile>true</ignoreNonCompile>
<ignoredDependencies>
<!-- We defined these as direct dependencies (as opposed to just declaring it in dependencyManagement)
to workaround https://issues.apache.org/jira/browse/MNG-7982. Now the dependency analyzer complains that
the dependency is unused, so we ignore it here-->
<ignoredDependency>org.apache.commons:commons-compress</ignoredDependency>
<ignoredDependency>org.apache.commons:commons-configuration2</ignoredDependency>
</ignoredDependencies>
@@ -819,9 +849,16 @@
<configuration>
<errorRemedy>failFast</errorRemedy>
<!--
The list of allowed licenses. If you see the build failing due to "There are some forbidden licenses used, please
check your dependencies", verify the conditions of the license and add the reference to it here.
-->
<includedLicenses>
<includedLicense>Apache License 2.0</includedLicense>
<includedLicense>BSD 2-Clause License</includedLicense>
@@ -830,7 +867,6 @@
<includedLicense>EDL 1.0</includedLicense>
<includedLicense>The Go license</includedLicense>
<includedLicense>Bouncy Castle Licence</includedLicense>
<includedLicense>CDDL + GPLv2 with classpath exception</includedLicense>
</includedLicenses>
<excludedScopes>test,provided,system</excludedScopes>
<failOnBlacklist>true</failOnBlacklist>
@@ -1134,9 +1170,9 @@
</executions>
</plugin>
<!--
Plugin executes license processing Python script, which copies third party license files into the directory
target/generated-licenses-info/META-INF/third-party-licenses, which is then included in the shaded JAR.
-->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
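The two test-scoped org.openjdk.jmh artifacts added to this pom (jmh-core and jmh-generator-annprocess) are the usual pair needed to compile and run JMH microbenchmarks from the test sources. A minimal, self-contained sketch of such a benchmark is shown below; the class name, the measured operation, and the runner options are illustrative assumptions and are not part of this commit.

import java.math.BigDecimal;
import java.util.concurrent.TimeUnit;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.OptionsBuilder;

@State(Scope.Benchmark)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
public class PowerOfTenBenchmark { // hypothetical benchmark class
  @Benchmark
  public BigDecimal computePowerOnDemand() {
    // Recomputes 10^36 on every invocation; the DataValidationUtil change below caches such values.
    return BigDecimal.TEN.pow(36);
  }

  public static void main(String[] args) throws RunnerException {
    new Runner(
            new OptionsBuilder()
                .include(PowerOfTenBenchmark.class.getSimpleName())
                .forks(1)
                .build())
        .run();
  }
}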
5 changes: 2 additions & 3 deletions scripts/process_licenses.py
@@ -31,7 +31,6 @@
MIT_LICENSE = "The MIT License"
GO_LICENSE = "The Go license"
BOUNCY_CASTLE_LICENSE = "Bouncy Castle Licence <https://www.bouncycastle.org/licence.html>"
CDDL_GPLv2 = "CDDL + GPLv2 with classpath exception"

# The SDK does not need to include licenses of dependencies, which aren't shaded
IGNORED_DEPENDENCIES = {"net.snowflake:snowflake-jdbc", "org.slf4j:slf4j-api"}
@@ -62,7 +61,6 @@
"org.bouncycastle:bcpkix-jdk18on": BOUNCY_CASTLE_LICENSE,
"org.bouncycastle:bcutil-jdk18on": BOUNCY_CASTLE_LICENSE,
"org.bouncycastle:bcprov-jdk18on": BOUNCY_CASTLE_LICENSE,
"javax.annotation:javax.annotation-api": CDDL_GPLv2
}


@@ -135,7 +133,8 @@ def main():
missing_licenses_str += f"{dependency_lookup_key}: {license_name}\n"
else:
raise Exception(
f"The dependency {dependency_lookup_key} does not ship a license file, but neither is it not defined in ADDITIONAL_LICENSES_MAP")
f"The dependency {dependency_lookup_key} does not ship a license file, but neither is it not "
f"defined in ADDITIONAL_LICENSES_MAP")

with open(Path(target_dir, "ADDITIONAL_LICENCES"), "w") as additional_licenses_handle:
additional_licenses_handle.write(missing_licenses_str)
Original file line number Diff line number Diff line change
@@ -16,7 +16,6 @@
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import net.snowflake.ingest.connection.TelemetryService;
import net.snowflake.ingest.streaming.InsertValidationResponse;
import net.snowflake.ingest.streaming.OffsetTokenVerificationFunction;
@@ -400,10 +399,10 @@ public float getSize() {
Set<String> verifyInputColumns(
Map<String, Object> row, InsertValidationResponse.InsertError error, int rowIndex) {
// Map of unquoted column name -> original column name
Map<String, String> inputColNamesMap =
row.keySet().stream()
.collect(Collectors.toMap(LiteralQuoteUtils::unquoteColumnName, value -> value));

Set<String> originalKeys = row.keySet();
Map<String, String> inputColNamesMap = new HashMap<>();
originalKeys.forEach(
key -> inputColNamesMap.put(LiteralQuoteUtils.unquoteColumnName(key), key));
// Check for extra columns in the row
List<String> extraCols = new ArrayList<>();
for (String columnName : inputColNamesMap.keySet()) {
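For context on the verifyInputColumns change above: both variants build a map from the unquoted column name back to the caller-supplied name, but the stream-based Collectors.toMap version throws IllegalStateException if two input names unquote to the same key, while the plain HashMap loop keeps the last entry and avoids the per-row stream allocation. A self-contained sketch of the difference follows; the unquote helper is a simplified stand-in for LiteralQuoteUtils.unquoteColumnName, not its real logic.

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Collectors;

public class ColumnNameMapSketch {
  // Simplified stand-in for LiteralQuoteUtils.unquoteColumnName (an assumption for this sketch).
  static String unquote(String name) {
    if (name.length() > 1 && name.startsWith("\"") && name.endsWith("\"")) {
      return name.substring(1, name.length() - 1);
    }
    return name.toUpperCase(Locale.ROOT);
  }

  public static void main(String[] args) {
    Map<String, Object> row = new HashMap<>();
    row.put("col1", 1);
    row.put("\"col2\"", 2);

    // Old approach: stream + Collectors.toMap (throws on duplicate unquoted keys).
    Map<String, String> viaStream =
        row.keySet().stream().collect(Collectors.toMap(ColumnNameMapSketch::unquote, k -> k));

    // New approach: plain loop into a HashMap (last duplicate wins, no stream overhead).
    Map<String, String> viaLoop = new HashMap<>();
    row.keySet().forEach(key -> viaLoop.put(unquote(key), key));

    System.out.println(viaStream); // e.g. {COL1=col1, col2="col2"} (iteration order not guaranteed)
    System.out.println(viaLoop);   // same mapping
  }
}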
Original file line number Diff line number Diff line change
@@ -86,6 +86,18 @@ class DataValidationUtil {
objectMapper.registerModule(module);
}

// Caching the powers of 10 that are used for checking the range of numbers because computing them
// on-demand is expensive.
private static final BigDecimal[] POWER_10 = makePower10Table();

private static BigDecimal[] makePower10Table() {
BigDecimal[] power10 = new BigDecimal[Power10.sb16Size];
for (int i = 0; i < Power10.sb16Size; i++) {
power10[i] = new BigDecimal(Power10.sb16Table[i]);
}
return power10;
}

/**
* Validates and parses input as JSON. All types in the object tree must be valid variant types,
* see {@link DataValidationUtil#isAllowedSemiStructuredType}.
@@ -823,7 +835,11 @@ static int validateAndParseBoolean(String columnName, Object input, long insertR

static void checkValueInRange(
BigDecimal bigDecimalValue, int scale, int precision, final long insertRowIndex) {
if (bigDecimalValue.abs().compareTo(BigDecimal.TEN.pow(precision - scale)) >= 0) {
BigDecimal comparand =
(precision >= scale) && (precision - scale) < POWER_10.length
? POWER_10[precision - scale]
: BigDecimal.TEN.pow(precision - scale);
if (bigDecimalValue.abs().compareTo(comparand) >= 0) {
throw new SFException(
ErrorCode.INVALID_FORMAT_ROW,
String.format(
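The checkValueInRange change above replaces a fresh BigDecimal.TEN.pow(precision - scale) computation on every check with a lookup into the precomputed POWER_10 table, falling back to pow only when the exponent is outside the table. A self-contained sketch of the same pattern is below; the 39-entry table and the fitsInRange helper are illustrative (the real code sizes its table from Power10.sb16Size).

import java.math.BigDecimal;

public class RangeCheckSketch {
  // Precompute 10^0 .. 10^38 once; 38 is Snowflake's maximum NUMBER precision.
  private static final BigDecimal[] POWER_10 = new BigDecimal[39];

  static {
    BigDecimal p = BigDecimal.ONE;
    for (int i = 0; i < POWER_10.length; i++) {
      POWER_10[i] = p;
      p = p.multiply(BigDecimal.TEN);
    }
  }

  // A value fits NUMBER(precision, scale) only if |value| < 10^(precision - scale).
  static boolean fitsInRange(BigDecimal value, int precision, int scale) {
    int exp = precision - scale;
    BigDecimal comparand =
        (exp >= 0 && exp < POWER_10.length) ? POWER_10[exp] : BigDecimal.TEN.pow(exp);
    return value.abs().compareTo(comparand) < 0;
  }

  public static void main(String[] args) {
    System.out.println(fitsInRange(new BigDecimal("123.45"), 5, 2)); // true
    System.out.println(fitsInRange(new BigDecimal("1234.5"), 5, 2)); // false, would need precision 6
  }
}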
Original file line number Diff line number Diff line change
@@ -122,6 +122,7 @@ List<List<ChannelData<T>>> getData() {

// blob encoding version
private final Constants.BdecVersion bdecVersion;
private volatile int numProcessors = Runtime.getRuntime().availableProcessors();

/**
* Constructor for TESTING that takes (usually mocked) StreamingIngestStage
@@ -360,6 +361,9 @@ void distributeFlushTasks() {
List<Pair<BlobData<T>, CompletableFuture<BlobMetadata>>> blobs = new ArrayList<>();
List<ChannelData<T>> leftoverChannelsDataPerTable = new ArrayList<>();

// The API states that the number of available processors reported can change and therefore, we
// should poll it occasionally.
numProcessors = Runtime.getRuntime().availableProcessors();
while (itr.hasNext() || !leftoverChannelsDataPerTable.isEmpty()) {
List<List<ChannelData<T>>> blobData = new ArrayList<>();
float totalBufferSizeInBytes = 0F;
@@ -704,8 +708,7 @@ String getClientPrefix() {
*/
boolean throttleDueToQueuedFlushTasks() {
ThreadPoolExecutor buildAndUpload = (ThreadPoolExecutor) this.buildUploadWorkers;
boolean throttleOnQueuedTasks =
buildAndUpload.getQueue().size() > Runtime.getRuntime().availableProcessors();
boolean throttleOnQueuedTasks = buildAndUpload.getQueue().size() > numProcessors;
if (throttleOnQueuedTasks) {
logger.logWarn(
"Throttled due too many queue flush tasks (probably because of slow uploading speed),"
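The FlushService changes above cache Runtime.getRuntime().availableProcessors() in a volatile field, refresh it once per distributeFlushTasks() pass, and let throttleDueToQueuedFlushTasks() read the cached value instead of querying the runtime on every check. A reduced, runnable sketch of that pattern, with an illustrative two-thread pool standing in for the real buildUploadWorkers executor:

import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

public class ThrottleSketch {
  // Cached processor count; volatile so a refresh on one thread is visible to the checking threads.
  private volatile int numProcessors = Runtime.getRuntime().availableProcessors();

  private final ThreadPoolExecutor buildUploadWorkers =
      new ThreadPoolExecutor(2, 2, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<>());

  void refreshProcessorCount() {
    // availableProcessors() can change at runtime, so re-read it once per flush cycle
    // rather than on every throttle check.
    numProcessors = Runtime.getRuntime().availableProcessors();
  }

  boolean throttleDueToQueuedFlushTasks() {
    return buildUploadWorkers.getQueue().size() > numProcessors;
  }

  public static void main(String[] args) {
    ThrottleSketch sketch = new ThrottleSketch();
    sketch.refreshProcessorCount();
    System.out.println("throttle=" + sketch.throttleDueToQueuedFlushTasks());
    sketch.buildUploadWorkers.shutdown();
  }
}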
Original file line number Diff line number Diff line change
@@ -9,9 +9,6 @@ public interface MemoryInfoProvider {
/** @return Max memory the JVM can allocate */
long getMaxMemory();

/** @return Total allocated JVM memory so far */
long getTotalMemory();

/** @return Free JVM memory */
long getFreeMemory();
}
Original file line number Diff line number Diff line change
@@ -4,20 +4,51 @@

package net.snowflake.ingest.streaming.internal;

import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

/** Reads memory information from JVM runtime */
public class MemoryInfoProviderFromRuntime implements MemoryInfoProvider {
@Override
public long getMaxMemory() {
return Runtime.getRuntime().maxMemory();
private final long maxMemory;
private volatile long totalFreeMemory;
private final ScheduledExecutorService executorService;
private static final long FREE_MEMORY_UPDATE_INTERVAL_MS = 100;
private static final MemoryInfoProviderFromRuntime INSTANCE =
new MemoryInfoProviderFromRuntime(FREE_MEMORY_UPDATE_INTERVAL_MS);

private MemoryInfoProviderFromRuntime(long freeMemoryUpdateIntervalMs) {
maxMemory = Runtime.getRuntime().maxMemory();
totalFreeMemory =
Runtime.getRuntime().freeMemory() + (maxMemory - Runtime.getRuntime().totalMemory());
executorService =
new ScheduledThreadPoolExecutor(
1,
r -> {
Thread th = new Thread(r, "MemoryInfoProviderFromRuntime");
th.setDaemon(true);
return th;
});
executorService.scheduleAtFixedRate(
this::updateFreeMemory, 0, freeMemoryUpdateIntervalMs, TimeUnit.MILLISECONDS);
}

private void updateFreeMemory() {
totalFreeMemory =
Runtime.getRuntime().freeMemory() + (maxMemory - Runtime.getRuntime().totalMemory());
}

public static MemoryInfoProviderFromRuntime getInstance() {
return INSTANCE;
}

@Override
public long getTotalMemory() {
return Runtime.getRuntime().totalMemory();
public long getMaxMemory() {
return maxMemory;
}

@Override
public long getFreeMemory() {
return Runtime.getRuntime().freeMemory();
return totalFreeMemory;
}
}
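With MemoryInfoProviderFromRuntime now a singleton whose free-memory figure is refreshed every 100 ms by a daemon thread, callers get a cheap volatile read instead of querying Runtime on every call. A hypothetical caller in the same package might look like the sketch below; the 5% backpressure threshold is an illustrative assumption, not something this commit defines.

public class MemoryGuardSketch {
  public static void main(String[] args) {
    MemoryInfoProvider memoryInfo = MemoryInfoProviderFromRuntime.getInstance();
    long maxMemory = memoryInfo.getMaxMemory();   // fixed for the lifetime of the JVM
    long freeMemory = memoryInfo.getFreeMemory(); // volatile read, refreshed in the background
    // Hypothetical policy: back off when less than 5% of the maximum heap is free.
    boolean applyBackpressure = freeMemory < maxMemory / 20;
    System.out.println("freeMemory=" + freeMemory + ", applyBackpressure=" + applyBackpressure);
  }
}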
Original file line number Diff line number Diff line change
@@ -9,6 +9,7 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import net.snowflake.ingest.utils.Constants;
import net.snowflake.ingest.utils.ErrorCode;
import net.snowflake.ingest.utils.Logging;
@@ -124,6 +125,12 @@ private SerializationResult serializeFromParquetWriteBuffers(

if (mergedChannelWriter != null) {
mergedChannelWriter.close();
this.verifyRowCounts(
"serializeFromParquetWriteBuffers",
mergedChannelWriter,
rowCount,
channelsDataPerTable,
-1);
}
return new SerializationResult(
channelsMetadataList,
@@ -216,6 +223,9 @@ private SerializationResult serializeFromJavaObjects(
rows.forEach(parquetWriter::writeRow);
parquetWriter.close();

this.verifyRowCounts(
"serializeFromJavaObjects", parquetWriter, rowCount, channelsDataPerTable, rows.size());

return new SerializationResult(
channelsMetadataList,
columnEpStatsMapCombined,
@@ -224,4 +234,71 @@
mergedData,
chunkMinMaxInsertTimeInMs);
}

/**
* Validates that the row count in metadata matches the row count in the Parquet footer and the row
* count written by the Parquet writer
*
* @param serializationType Serialization type, used for logging purposes only
* @param writer Parquet writer writing the data
* @param channelsDataPerTable Channel data
* @param totalMetadataRowCount Row count calculated during metadata collection
* @param javaSerializationTotalRowCount Total row count when java object serialization is used.
* Used only for logging purposes if there is a mismatch.
*/
private void verifyRowCounts(
String serializationType,
BdecParquetWriter writer,
long totalMetadataRowCount,
List<ChannelData<ParquetChunkData>> channelsDataPerTable,
long javaSerializationTotalRowCount) {
long parquetTotalRowsWritten = writer.getRowsWritten();

List<Long> parquetFooterRowsPerBlock = writer.getRowCountsFromFooter();
long parquetTotalRowsInFooter = 0;
for (long perBlockCount : parquetFooterRowsPerBlock) {
parquetTotalRowsInFooter += perBlockCount;
}

if (parquetTotalRowsInFooter != totalMetadataRowCount
|| parquetTotalRowsWritten != totalMetadataRowCount) {

final String perChannelRowCountsInMetadata =
channelsDataPerTable.stream()
.map(x -> String.valueOf(x.getRowCount()))
.collect(Collectors.joining(","));

final String channelNames =
channelsDataPerTable.stream()
.map(x -> String.valueOf(x.getChannelContext().getName()))
.collect(Collectors.joining(","));

final String perBlockRowCountsInFooter =
parquetFooterRowsPerBlock.stream().map(String::valueOf).collect(Collectors.joining(","));

final long channelsCountInMetadata = channelsDataPerTable.size();

throw new SFException(
ErrorCode.INTERNAL_ERROR,
String.format(
"[%s]The number of rows in Parquet does not match the number of rows in metadata. "
+ "parquetTotalRowsInFooter=%d "
+ "totalMetadataRowCount=%d "
+ "parquetTotalRowsWritten=%d "
+ "perChannelRowCountsInMetadata=%s "
+ "perBlockRowCountsInFooter=%s "
+ "channelsCountInMetadata=%d "
+ "countOfSerializedJavaObjects=%d "
+ "channelNames=%s",
serializationType,
parquetTotalRowsInFooter,
totalMetadataRowCount,
parquetTotalRowsWritten,
perChannelRowCountsInMetadata,
perBlockRowCountsInFooter,
channelsCountInMetadata,
javaSerializationTotalRowCount,
channelNames));
}
}
}
