feat: Tool for converting Record Files to Block Stream (#389)
Signed-off-by: jasperpotts <[email protected]>
Co-authored-by: jasperpotts <[email protected]>
jasperpotts and jasperpotts authored Dec 20, 2024
1 parent 3ad4efc commit ac58d0b
Showing 26 changed files with 2,620 additions and 32 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -55,6 +55,7 @@ gradle-app.setting
.env

server/data/
/tools/data/

# manual test files
server/src/test/resources/test_output/
29 changes: 25 additions & 4 deletions buildSrc/src/main/kotlin/com.hedera.block.jpms-modules.gradle.kts
@@ -68,6 +68,15 @@ jvmDependencyConflicts.patch {
module("com.google.protobuf:protobuf-java-util") {
annotationLibraries.forEach { removeDependency(it) }
}
module("com.google.cloud:google-cloud-storage") {
annotationLibraries.forEach { removeDependency(it) }
}
module("com.google.api.grpc:proto-google-cloud-monitoring-v3") {
annotationLibraries.forEach { removeDependency(it) }
}
module("com.google.cloud:google-cloud-monitoring") {
annotationLibraries.forEach { removeDependency(it) }
}
module("io.prometheus:simpleclient") {
removeDependency("io.prometheus:simpleclient_tracer_otel")
removeDependency("io.prometheus:simpleclient_tracer_otel_agent")
@@ -124,7 +133,10 @@ extraJavaModuleInfo {
exportAllPackages()
mergeJar("javax.annotation:javax.annotation-api")
}
module("com.google.errorprone:error_prone_annotations", "com.google.errorprone.annotations")
module("com.google.errorprone:error_prone_annotations", "com.google.errorprone.annotations") {
exportAllPackages()
patchRealModule()
}
module("com.google.j2objc:j2objc-annotations", "com.google.j2objc.annotations")
module("com.google.protobuf:protobuf-java", "com.google.protobuf") {
exportAllPackages()
@@ -142,12 +154,18 @@
module("io.perfmark:perfmark-api", "io.perfmark")
module("javax.inject:javax.inject", "javax.inject")

module("commons-codec:commons-codec", "org.apache.commons.codec")
module("commons-codec:commons-codec", "org.apache.commons.codec") {
exportAllPackages()
patchRealModule()
}
module("org.apache.commons:commons-math3", "org.apache.commons.math3")
module("org.apache.commons:commons-collections4", "org.apache.commons.collections4")
module("com.esaulpaugh:headlong", "headlong")

module("org.checkerframework:checker-qual", "org.checkerframework.checker.qual")
module("org.checkerframework:checker-qual", "org.checkerframework.checker.qual") {
exportAllPackages()
patchRealModule()
}
module("net.i2p.crypto:eddsa", "net.i2p.crypto.eddsa")
module("org.jetbrains:annotations", "org.jetbrains.annotations")
module("org.antlr:antlr4-runtime", "org.antlr.antlr4.runtime")
@@ -167,7 +185,10 @@
requireAllDefinedDependencies()
requires("jdk.httpserver")
}

module("com.google.j2objc:j2objc-annotations", "com.google.j2objc.annotations") {
exportAllPackages()
patchRealModule()
}
// Annotation processing only
module("com.google.auto.service:auto-service-annotations", "com.google.auto.service")
module("com.google.auto.service:auto-service", "com.google.auto.service.processor")
2 changes: 1 addition & 1 deletion settings.gradle.kts
@@ -49,7 +49,7 @@ dependencyResolutionManagement {
val protobufVersion = "4.28.2"
val helidonVersion = "4.1.1"
val grpcIoVersion = "1.65.1"
var pbjVersion = "0.9.11"
val pbjVersion = "0.9.11"

// Compile time dependencies
version("io.helidon.webserver.http2", helidonVersion)
2 changes: 1 addition & 1 deletion stream/build.gradle.kts
@@ -33,7 +33,7 @@ tasks.withType<JavaCompile>().configureEach {
tasks.cloneHederaProtobufs {
// uncomment below to use a specific tag
// tag = "v0.53.0" or a specific commit like "0047255"
tag = "1033f10"
tag = "eab8b58e30336512bcf387c803e6fc86b6ebe010"

// uncomment below to use a specific branch
// branch = "main"
16 changes: 16 additions & 0 deletions tool.sh
@@ -0,0 +1,16 @@
#!/bin/bash
# run gradle jar build and send output to /dev/null
./gradlew -q tool:shadowJar > /dev/null
# check if last command failed and exit if so
if [ $? -ne 0 ]; then
echo "Build failed"
exit 1
fi
# change to the tools directory
pushd tools > /dev/null
# find the jar name in the build/libs directory
JAR=$(find build/libs -name 'tools-*-all.jar')
# run the built command line tool jar, forwarding all arguments
java -jar "$JAR" "$@"
# change back to the original directory
popd > /dev/null
136 changes: 129 additions & 7 deletions tools/README.md
@@ -3,28 +3,37 @@
## Table of Contents

1. [Overview](#overview)
2. [Running from command line](#running-from-command-line)
3. [Subcommands](#subcommands)
   1. [The `json` Subcommand](#the-json-subcommand)
   2. [The `info` Subcommand](#the-info-subcommand)

## Overview

This subproject provides command line tools for working with block stream files (and possibly other artifacts in the
future). It uses [picocli](https://picocli.info) to provide a command line interface, which makes it easy to extend
with new subcommands and options.
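As a rough sketch of what adding a subcommand involves (hypothetical class, names, and options — not code from this
repository):

```java
import java.io.File;
import java.util.List;
import java.util.concurrent.Callable;
import picocli.CommandLine;
import picocli.CommandLine.Command;
import picocli.CommandLine.Option;
import picocli.CommandLine.Parameters;

// Hypothetical subcommand that just counts the block files it is given.
@Command(name = "count", description = "Count the given block files")
public class CountCommand implements Callable<Integer> {

    @Option(names = {"-v", "--verbose"}, description = "Print each file name")
    private boolean verbose;

    @Parameters(description = "The block files to count")
    private List<File> files;

    @Override
    public Integer call() {
        if (verbose && files != null) {
            files.forEach(f -> System.out.println(f.getPath()));
        }
        System.out.println(files == null ? 0 : files.size());
        return 0; // process exit code
    }

    public static void main(String[] args) {
        System.exit(new CommandLine(new CountCommand()).execute(args));
    }
}
```

A subcommand like this would then be registered on the top-level command via picocli's `subcommands` attribute.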

## Running from command line

Refer to the [Quickstart](docs/quickstart.md) for a quick guide on how to run the tools CLI.

## Subcommands

The following subcommands are available:
- `json` - Converts a binary block stream to JSON
- `info` - Prints info for block files
- `record2block` - Converts historical record stream files into blocks
- `fetchRecordsCsv` - Downloads the mirror node record table CSV dump from a GCP bucket
- `extractBlockTimes` - Extracts block times from the mirror node records CSV file
- `validateBlockTimes` - Validates a block times file as produced by `extractBlockTimes`
- `addNewerBlockTimes` - Extends the block times file with newer block times

### The `json` Subcommand

Converts a binary block stream to JSON

`Usage: json [-t] [-ms=<minSizeMb>] [<files>...]`

**Options:**

@@ -42,7 +51,7 @@ transactions human-readable.

Prints info for block files

`Usage: info [-c] [-ms=<minSizeMb>] [-o=<outputFile>] [<files>...]`

**Options:**

@@ -58,6 +67,119 @@ Prints info for block files
- `<files>...`
  - The block files or directories of block files to print info for

### The `record2block` Subcommand

Converts historical record stream files into blocks. This depends on the `block_times.bin` file being present, which
can be created by running the commands `fetchRecordsCsv`, `extractBlockTimes`, and `addNewerBlockTimes`, in that order.
The block times file can be validated with the `validateBlockTimes` command.

This command reads data from public requester-pays Google Cloud buckets, so you need to be authenticated with the
Google Cloud SDK. You can authenticate with `gcloud auth application-default login` or `gcloud auth login`; see the
[Google documentation](https://cloud.google.com/storage/docs/reference/libraries#authentication) for more info.
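As a minimal sketch of what a requester-pays read looks like with the `google-cloud-storage` client this module uses
(the bucket, object, and billing project names below are placeholders, not values from this repository):

```java
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.BlobId;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;

public class RequesterPaysRead {
    public static void main(String[] args) {
        // Uses Application Default Credentials, i.e. whatever
        // `gcloud auth application-default login` configured.
        Storage storage = StorageOptions.getDefaultInstance().getService();

        // Placeholder bucket and object; requester-pays buckets need a
        // billing project that the download is charged to.
        BlobId id = BlobId.of("example-records-bucket", "example-record-file.rcd.gz");
        String billingProject = "my-billing-project";
        Blob blob = storage.get(id, Storage.BlobGetOption.userProject(billingProject));
        byte[] bytes = blob.getContent(Blob.BlobSourceOption.userProject(billingProject));
        System.out.println("Downloaded " + bytes.length + " bytes");
    }
}
```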

`Usage: record2block [-s <blockNumber>] [-e <blockNumber>] [-j] [-c] [--min-node-account-id=<minNodeAccountId>] [--max-node-account-id=<maxNodeAccountId>] [-d <dataDir>] [--block-times=<blockTimesFile>]`

**Options:**

- `-s <blockNumber>` or `--start-block=<blockNumber>`
  - The first block number to process
  - Default: 0
- `-e <blockNumber>` or `--end-block=<blockNumber>`
  - The last block number to process
  - Default: 3001
- `-j` or `--json`
  - Also output blocks as JSON; useful for debugging and testing
  - Default: false
- `-c` or `--cache-enabled`
  - Use a local cache for downloaded content; saves cloud costs and bandwidth when testing
  - Default: false
- `--min-node-account-id=<minNodeAccountId>`
  - The account ID of the first node in the network
  - Default: 3
- `--max-node-account-id=<maxNodeAccountId>`
  - The account ID of the last node in the network
  - Default: 34
- `--data-dir=<dataDir>`
  - The data directory for output and temporary files
  - Default: "data"
- `--block-times=<blockTimesFile>`
  - Path to the block times ".bin" file
  - Default: "data/block_times.bin"

### The `fetchRecordsCsv` Subcommand

Downloads the mirror node record table CSV dump from a GCP bucket. The records table on mirror node has a row for every
block mirror node knows about. The CSV file is huge: 11 GB+ as of November 2024. This data is important for the
records-to-blocks conversion because the block number assigned to a record file has to match what mirror node, as the
source of truth, says.

This command reads data from public requester-pays Google Cloud buckets, so you need to be authenticated with the
Google Cloud SDK. You can authenticate with `gcloud auth application-default login` or `gcloud auth login`; see the
[Google documentation](https://cloud.google.com/storage/docs/reference/libraries#authentication) for more info.

`Usage: fetchRecordsCsv [--record-csv=<recordFilePath>]`

**Options:**

- `--record-csv=<recordFilePath>`
  - Path to the record CSV file
  - Default: "data/record.csv"

### The `extractBlockTimes` Subcommand

Extracts block times from the mirror node records CSV file: reads `<recordFilePath>` and produces `<blockTimesFile>`.
We need to convert the mirror node records CSV because it is huge (11 GB+ compressed), too large to fit into RAM, and
hard to access randomly. The only part of the data needed for the records-to-blocks conversion is the block times. The
block time is the record file time for a given block; the record file consensus time is used as the file name of the
record file in the bucket.

The block times file is a binary file of longs, one per block: each long is the number of nanoseconds for that block
after the first block time. So the first block is 0, the second roughly 5 seconds later, and so on. The index is the
block number, so block 0 is the first long, block 1 the second, etc. This file can then be memory mapped and used as a
fast lookup from block number (array offset) to block time, i.e. record file name.
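For illustration, a memory-mapped lookup over such a file might look like this (a sketch only; the path and block
number are assumptions, not the tool's actual code):

```java
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

public class BlockTimesLookup {
    public static void main(String[] args) throws IOException {
        Path blockTimesFile = Path.of("data", "block_times.bin");
        try (FileChannel channel = FileChannel.open(blockTimesFile, StandardOpenOption.READ)) {
            // Map the whole file read-only; each block is one 8-byte long.
            MappedByteBuffer buffer = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size());
            long blockNumber = 1_000_000L; // hypothetical block to look up
            long nanosAfterFirstBlock = buffer.getLong((int) (blockNumber * Long.BYTES));
            System.out.println("Block " + blockNumber + " is " + nanosAfterFirstBlock
                    + " ns after the first block time");
        }
    }
}
```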

`Usage: extractBlockTimes [--record-csv=<recordFilePath>] [--block-times=<blockTimesFile>]`

**Options:**

- `--record-csv=<recordFilePath>`
  - Path to the record CSV file
  - Default: "data/record.csv"
- `--block-times=<blockTimesFile>`
  - Path to the block times ".bin" file
  - Default: "data/block_times.bin"


### The `addNewerBlockTimes` Subcommand

Extends the block times file with newer block times. This is done by listing the record files in the bucket and
counting them to derive block numbers. It processes day by day, listing one day and then appending the new block times
to the block times file. At the end of each day it checks, via the mirror node REST API, that the block number it has
computed still matches mirror node. This whole process can take a long time if the mirror node CSV dump is old.
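A sketch of what such a per-day consistency check could look like against the public mainnet mirror node REST API (the
block number and response handling below are illustrative assumptions, not the tool's actual code):

```java
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class MirrorNodeBlockCheck {
    public static void main(String[] args) throws IOException, InterruptedException {
        long computedBlockNumber = 1_000_000L; // hypothetical locally computed block number
        URI uri = URI.create(
                "https://mainnet-public.mirrornode.hedera.com/api/v1/blocks/" + computedBlockNumber);
        HttpRequest request = HttpRequest.newBuilder(uri).GET().build();
        HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());
        // HTTP 200 with a matching block in the JSON body means the locally
        // computed numbering is still in sync; a real implementation would
        // parse the JSON instead of printing it.
        System.out.println("HTTP " + response.statusCode() + ": " + response.body());
    }
}
```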

This command reads data from public requester-pays Google Cloud buckets, so you need to be authenticated with the
Google Cloud SDK. You can authenticate with `gcloud auth application-default login` or `gcloud auth login`; see the
[Google documentation](https://cloud.google.com/storage/docs/reference/libraries#authentication) for more info.

`Usage: addNewerBlockTimes [-c] [--min-node-account-id=3] [--max-node-account-id=34] [-d <dataDir>] [--block-times=<blockTimesFile>]`

**Options:**

- `-c` or `--cache-enabled`
  - Use a local cache for downloaded content; saves cloud costs and bandwidth when testing
  - Default: true
- `--min-node-account-id=<minNodeAccountId>`
  - The account ID of the first node in the network
  - Default: 3
- `--max-node-account-id=<maxNodeAccountId>`
  - The account ID of the last node in the network
  - Default: 34
- `--data-dir=<dataDir>`
  - The data directory for output and temporary files
  - Default: "data"
- `--block-times=<blockTimesFile>`
  - Path to the block times ".bin" file
  - Default: "data/block_times.bin"
69 changes: 66 additions & 3 deletions tools/build.gradle.kts
@@ -1,3 +1,23 @@
/*
* Copyright (C) 2024 Hedera Hashgraph, LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import com.github.jengelman.gradle.plugins.shadow.internal.DefaultDependencyFilter
import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar
import org.gradlex.javamodule.dependencies.tasks.ModuleDirectivesScopeCheck

/*
* Copyright (C) 2022-2024 Hedera Hashgraph, LLC
*
@@ -17,19 +37,62 @@
plugins {
id("application")
id("com.hedera.block.tools")
id("com.gradleup.shadow") version "8.3.5"
}

description = "Hedera Block Stream Tools"

application { mainClass = "com.hedera.block.tools.BlockStreamTool" }

// Generate Manifest with Main-Class and Implementation-Title
tasks.withType<Jar>().configureEach {
manifest {
attributes(
"Main-Class" to application.mainClass.get(),
"Implementation-Title" to project.name,
"Implementation-Version" to project.version
)
}
}

// Allow non-module Jar
extraJavaModuleInfo {
failOnMissingModuleInfo = false
failOnAutomaticModules = false
}

// Disable module directives scope check as we are not using modules
tasks.withType<ModuleDirectivesScopeCheck>().configureEach { enabled = false }

mainModuleInfo {
runtimeOnly("com.swirlds.config.impl")
runtimeOnly("org.apache.logging.log4j.slf4j2.impl")
runtimeOnly("io.grpc.netty.shaded")
}

testModuleInfo { requiresStatic("com.github.spotbugs.annotations") }

dependencies {
implementation(platform("com.google.cloud:libraries-bom:26.49.0"))
implementation("com.google.cloud:google-cloud-storage")
implementation("com.github.luben:zstd-jni:1.5.6-6")
implementation("info.picocli:picocli:4.7.6")
// depend on peer streams gradle module to get access to protobuf generated classes
implementation(project(":stream"))
}

tasks.withType<ShadowJar>().configureEach {
group = "shadow"

// There is an issue in the shadow plugin that it automatically accesses the
// files in 'runtimeClasspath' while Gradle is building the task graph.
// See: https://github.com/GradleUp/shadow/issues/882
dependencyFilter = NoResolveDependencyFilter()
}

// Disable dependency resolution as it conflicts with shadow plugin
class NoResolveDependencyFilter : DefaultDependencyFilter(project) {
override fun resolve(configuration: FileCollection): FileCollection {
return configuration
}
}
7 changes: 7 additions & 0 deletions tools/docs/quickstart.md
@@ -17,6 +17,13 @@
> recommended to use the project qualifier (i.e. `:tools:`) for
> both simplicity and clarity.
### Easy way for Unix-based OSs

There is a command line script for building and running the tool, located in the root of the repository. It has the
nice extra feature of giving you colored console output.
```
./tool.sh info --help
```

### Build the Tools

> **NOTE:** if you have not done so already, it is