feat: Tool for converting Record Files to Block Stream (#389)
Signed-off-by: jasperpotts <[email protected]>
Co-authored-by: jasperpotts <[email protected]>
jasperpotts and jasperpotts authored Dec 20, 2024
1 parent 3ad4efc commit ac58d0b
Showing 26 changed files with 2,620 additions and 32 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -55,6 +55,7 @@ gradle-app.setting
.env

server/data/
/tools/data/

# manual test files
server/src/test/resources/test_output/
29 changes: 25 additions & 4 deletions buildSrc/src/main/kotlin/com.hedera.block.jpms-modules.gradle.kts
@@ -68,6 +68,15 @@ jvmDependencyConflicts.patch {
module("com.google.protobuf:protobuf-java-util") {
annotationLibraries.forEach { removeDependency(it) }
}
module("com.google.cloud:google-cloud-storage") {
annotationLibraries.forEach { removeDependency(it) }
}
module("com.google.api.grpc:proto-google-cloud-monitoring-v3") {
annotationLibraries.forEach { removeDependency(it) }
}
module("com.google.cloud:google-cloud-monitoring") {
annotationLibraries.forEach { removeDependency(it) }
}
module("io.prometheus:simpleclient") {
removeDependency("io.prometheus:simpleclient_tracer_otel")
removeDependency("io.prometheus:simpleclient_tracer_otel_agent")
@@ -124,7 +133,10 @@ extraJavaModuleInfo {
exportAllPackages()
mergeJar("javax.annotation:javax.annotation-api")
}
module("com.google.errorprone:error_prone_annotations", "com.google.errorprone.annotations")
module("com.google.errorprone:error_prone_annotations", "com.google.errorprone.annotations") {
exportAllPackages()
patchRealModule()
}
module("com.google.j2objc:j2objc-annotations", "com.google.j2objc.annotations")
module("com.google.protobuf:protobuf-java", "com.google.protobuf") {
exportAllPackages()
@@ -142,12 +154,18 @@
module("io.perfmark:perfmark-api", "io.perfmark")
module("javax.inject:javax.inject", "javax.inject")

module("commons-codec:commons-codec", "org.apache.commons.codec")
module("commons-codec:commons-codec", "org.apache.commons.codec") {
exportAllPackages()
patchRealModule()
}
module("org.apache.commons:commons-math3", "org.apache.commons.math3")
module("org.apache.commons:commons-collections4", "org.apache.commons.collections4")
module("com.esaulpaugh:headlong", "headlong")

module("org.checkerframework:checker-qual", "org.checkerframework.checker.qual")
module("org.checkerframework:checker-qual", "org.checkerframework.checker.qual") {
exportAllPackages()
patchRealModule()
}
module("net.i2p.crypto:eddsa", "net.i2p.crypto.eddsa")
module("org.jetbrains:annotations", "org.jetbrains.annotations")
module("org.antlr:antlr4-runtime", "org.antlr.antlr4.runtime")
@@ -167,7 +185,10 @@
requireAllDefinedDependencies()
requires("jdk.httpserver")
}

module("com.google.j2objc:j2objc-annotations", "com.google.j2objc.annotations") {
exportAllPackages()
patchRealModule()
}
// Annotation processing only
module("com.google.auto.service:auto-service-annotations", "com.google.auto.service")
module("com.google.auto.service:auto-service", "com.google.auto.service.processor")
2 changes: 1 addition & 1 deletion settings.gradle.kts
@@ -49,7 +49,7 @@ dependencyResolutionManagement {
val protobufVersion = "4.28.2"
val helidonVersion = "4.1.1"
val grpcIoVersion = "1.65.1"
var pbjVersion = "0.9.11"
val pbjVersion = "0.9.11"

// Compile time dependencies
version("io.helidon.webserver.http2", helidonVersion)
2 changes: 1 addition & 1 deletion stream/build.gradle.kts
@@ -33,7 +33,7 @@ tasks.withType<JavaCompile>().configureEach {
tasks.cloneHederaProtobufs {
// uncomment below to use a specific tag
// tag = "v0.53.0" or a specific commit like "0047255"
tag = "1033f10"
tag = "eab8b58e30336512bcf387c803e6fc86b6ebe010"

// uncomment below to use a specific branch
// branch = "main"
16 changes: 16 additions & 0 deletions tool.sh
@@ -0,0 +1,16 @@
#!/bin/bash
# run gradle jar build and send output to /dev/null
./gradlew -q tool:shadowJar > /dev/null
# check if last command failed and exit if so
if [ $? -ne 0 ]; then
echo "Build failed"
exit 1
fi
# change to the tools directory
pushd tools > /dev/null
# find the jar name in the build/libs directory
JAR=$(find build/libs -name 'tools-*-all.jar')
# run the built command line tool jar, forwarding all arguments
java -jar "$JAR" "$@"
# change back to the original directory
popd > /dev/null
136 changes: 129 additions & 7 deletions tools/README.md
@@ -3,28 +3,37 @@
## Table of Contents

1. [Overview](#overview)
2. [Running from command line](#running-from-command-line)
3. [Subcommands](#subcommands)
   1. [The `json` Subcommand](#the-json-subcommand)
   2. [The `info` Subcommand](#the-info-subcommand)

## Overview

This subproject provides command line tools for working with block stream files (and possibly other artifacts in the
future). It uses [picocli](https://picocli.info) to provide a command line interface, which makes it easy to extend
with new subcommands and options.
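As a rough sketch of what adding a subcommand involves (hypothetical class, names, and options — not code from this
repository):

```java
import java.io.File;
import java.util.List;
import java.util.concurrent.Callable;
import picocli.CommandLine;
import picocli.CommandLine.Command;
import picocli.CommandLine.Option;
import picocli.CommandLine.Parameters;

// Hypothetical subcommand that just counts the block files it is given.
@Command(name = "count", description = "Count the given block files")
public class CountCommand implements Callable<Integer> {

    @Option(names = {"-v", "--verbose"}, description = "Print each file name")
    private boolean verbose;

    @Parameters(description = "The block files to count")
    private List<File> files;

    @Override
    public Integer call() {
        if (verbose && files != null) {
            files.forEach(f -> System.out.println(f.getPath()));
        }
        System.out.println(files == null ? 0 : files.size());
        return 0; // process exit code
    }

    public static void main(String[] args) {
        System.exit(new CommandLine(new CountCommand()).execute(args));
    }
}
```

A subcommand like this would then be registered on the top-level command via picocli's `subcommands` attribute.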

## Running from command line

Refer to the [Quickstart](docs/quickstart.md) for a quick guide on how to run the tools CLI.

## Subcommands

The following subcommands are available:
- `json` - Converts a binary block stream to JSON
- `info` - Prints info for block files
- `record2block` - Converts historical record stream files into blocks
- `fetchRecordsCsv` - Downloads the mirror node record table CSV dump from a GCP bucket
- `extractBlockTimes` - Extracts block times from the mirror node records CSV file
- `validateBlockTimes` - Validates a block times file as produced by `extractBlockTimes`
- `addNewerBlockTimes` - Extends the block times file with newer block times

### The `json` Subcommand

Converts a binary block stream to JSON

`Usage: json [-t] [-ms=<minSizeMb>] [<files>...]`

**Options:**

@@ -42,7 +51,7 @@ transactions human-readable.

Prints info for block files

`Usage: info [-c] [-ms=<minSizeMb>] [-o=<outputFile>] [<files>...]`

**Options:**

@@ -58,6 +67,119 @@ Prints info for block files
- `<files>...`
  - The block files or directories of block files to print info for

### The `record2block` Subcommand

Converts historical record stream files into blocks. This depends on the `block_times.bin` file being present, which
can be created by running the commands `fetchRecordsCsv`, `extractBlockTimes`, and `addNewerBlockTimes`, in that order.
The block times file can be validated with the `validateBlockTimes` command.

This command reads data from public requester-pays Google Cloud buckets, so you need to be authenticated with the
Google Cloud SDK. You can authenticate with `gcloud auth application-default login` or `gcloud auth login`; see the
[Google documentation](https://cloud.google.com/storage/docs/reference/libraries#authentication) for more info.
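As a minimal sketch of what a requester-pays read looks like with the `google-cloud-storage` client this module uses
(the bucket, object, and billing project names below are placeholders, not values from this repository):

```java
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.BlobId;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;

public class RequesterPaysRead {
    public static void main(String[] args) {
        // Uses Application Default Credentials, i.e. whatever
        // `gcloud auth application-default login` configured.
        Storage storage = StorageOptions.getDefaultInstance().getService();

        // Placeholder bucket and object; requester-pays buckets need a
        // billing project that the download is charged to.
        BlobId id = BlobId.of("example-records-bucket", "example-record-file.rcd.gz");
        String billingProject = "my-billing-project";
        Blob blob = storage.get(id, Storage.BlobGetOption.userProject(billingProject));
        byte[] bytes = blob.getContent(Blob.BlobSourceOption.userProject(billingProject));
        System.out.println("Downloaded " + bytes.length + " bytes");
    }
}
```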

`Usage: record2block [-s <blockNumber>] [-e <blockNumber>] [-j] [-c] [--min-node-account-id=<minNodeAccountId>] [--max-node-account-id=<maxNodeAccountId>] [-d <dataDir>] [--block-times=<blockTimesFile>]`

**Options:**

- `-s <blockNumber>` or `--start-block=<blockNumber>`
  - The first block number to process
  - Default: 0
- `-e <blockNumber>` or `--end-block=<blockNumber>`
  - The last block number to process
  - Default: 3001
- `-j` or `--json`
  - Also output blocks as JSON; useful for debugging and testing
  - Default: false
- `-c` or `--cache-enabled`
  - Use a local cache for downloaded content; saves cloud costs and bandwidth when testing
  - Default: false
- `--min-node-account-id=<minNodeAccountId>`
  - The account ID of the first node in the network
  - Default: 3
- `--max-node-account-id=<maxNodeAccountId>`
  - The account ID of the last node in the network
  - Default: 34
- `--data-dir=<dataDir>`
  - The data directory for output and temporary files
  - Default: "data"
- `--block-times=<blockTimesFile>`
  - Path to the block times ".bin" file
  - Default: "data/block_times.bin"

### The `fetchRecordsCsv` Subcommand

Downloads the mirror node record table CSV dump from a GCP bucket. The records table on mirror node has a row for every
block mirror node knows about. The CSV file is huge: 11 GB+ as of November 2024. This data is important for the
records-to-blocks conversion because the block number assigned to a record file has to match what mirror node, as the
source of truth, says.

This command reads data from public requester-pays Google Cloud buckets, so you need to be authenticated with the
Google Cloud SDK. You can authenticate with `gcloud auth application-default login` or `gcloud auth login`; see the
[Google documentation](https://cloud.google.com/storage/docs/reference/libraries#authentication) for more info.

`Usage: fetchRecordsCsv [--record-csv=<recordFilePath>]`

**Options:**

- `--record-csv=<recordFilePath>`
  - Path to the record CSV file
  - Default: "data/record.csv"

### The `extractBlockTimes` Subcommand

Extracts block times from the mirror node records CSV file: reads `<recordFilePath>` and produces `<blockTimesFile>`.
We need to convert the mirror node records CSV because it is huge (11 GB+ compressed), too large to fit into RAM, and
hard to access randomly. The only part of the data needed for the records-to-blocks conversion is the block times. The
block time is the record file time for a given block; the record file consensus time is used as the file name of the
record file in the bucket.

The block times file is a binary file of longs, one per block: each long is the number of nanoseconds for that block
after the first block time. So the first block is 0, the second roughly 5 seconds later, and so on. The index is the
block number, so block 0 is the first long, block 1 the second, etc. This file can then be memory mapped and used as a
fast lookup from block number (array offset) to block time, i.e. record file name.
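For illustration, a memory-mapped lookup over such a file might look like this (a sketch only; the path and block
number are assumptions, not the tool's actual code):

```java
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

public class BlockTimesLookup {
    public static void main(String[] args) throws IOException {
        Path blockTimesFile = Path.of("data", "block_times.bin");
        try (FileChannel channel = FileChannel.open(blockTimesFile, StandardOpenOption.READ)) {
            // Map the whole file read-only; each block is one 8-byte long.
            MappedByteBuffer buffer = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size());
            long blockNumber = 1_000_000L; // hypothetical block to look up
            long nanosAfterFirstBlock = buffer.getLong((int) (blockNumber * Long.BYTES));
            System.out.println("Block " + blockNumber + " is " + nanosAfterFirstBlock
                    + " ns after the first block time");
        }
    }
}
```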

`Usage: extractBlockTimes [--record-csv=<recordFilePath>] [--block-times=<blockTimesFile>]`

**Options:**

- `--record-csv=<recordFilePath>`
  - Path to the record CSV file
  - Default: "data/record.csv"
- `--block-times=<blockTimesFile>`
  - Path to the block times ".bin" file
  - Default: "data/block_times.bin"


### The `addNewerBlockTimes` Subcommand

Extends the block times file with newer block times. This is done by listing the record files in the bucket and
counting them to derive block numbers. It processes day by day, listing one day and then appending the new block times
to the block times file. At the end of each day it checks, via the mirror node REST API, that the block number it has
computed still matches mirror node. This whole process can take a long time if the mirror node CSV dump is old.
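A sketch of what such a per-day consistency check could look like against the public mainnet mirror node REST API (the
block number and response handling below are illustrative assumptions, not the tool's actual code):

```java
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class MirrorNodeBlockCheck {
    public static void main(String[] args) throws IOException, InterruptedException {
        long computedBlockNumber = 1_000_000L; // hypothetical locally computed block number
        URI uri = URI.create(
                "https://mainnet-public.mirrornode.hedera.com/api/v1/blocks/" + computedBlockNumber);
        HttpRequest request = HttpRequest.newBuilder(uri).GET().build();
        HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());
        // HTTP 200 with a matching block in the JSON body means the locally
        // computed numbering is still in sync; a real implementation would
        // parse the JSON instead of printing it.
        System.out.println("HTTP " + response.statusCode() + ": " + response.body());
    }
}
```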

This command reads data from public requester-pays Google Cloud buckets, so you need to be authenticated with the
Google Cloud SDK. You can authenticate with `gcloud auth application-default login` or `gcloud auth login`; see the
[Google documentation](https://cloud.google.com/storage/docs/reference/libraries#authentication) for more info.

`Usage: addNewerBlockTimes [-c] [--min-node-account-id=3] [--max-node-account-id=34] [-d <dataDir>] [--block-times=<blockTimesFile>]`

**Options:**

- `-c` or `--cache-enabled`
  - Use a local cache for downloaded content; saves cloud costs and bandwidth when testing
  - Default: true
- `--min-node-account-id=<minNodeAccountId>`
  - The account ID of the first node in the network
  - Default: 3
- `--max-node-account-id=<maxNodeAccountId>`
  - The account ID of the last node in the network
  - Default: 34
- `--data-dir=<dataDir>`
  - The data directory for output and temporary files
  - Default: "data"
- `--block-times=<blockTimesFile>`
  - Path to the block times ".bin" file
  - Default: "data/block_times.bin"
69 changes: 66 additions & 3 deletions tools/build.gradle.kts
@@ -1,3 +1,23 @@
/*
* Copyright (C) 2024 Hedera Hashgraph, LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import com.github.jengelman.gradle.plugins.shadow.internal.DefaultDependencyFilter
import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar
import org.gradlex.javamodule.dependencies.tasks.ModuleDirectivesScopeCheck

/*
* Copyright (C) 2022-2024 Hedera Hashgraph, LLC
*
@@ -17,19 +37,62 @@
plugins {
id("application")
id("com.hedera.block.tools")
id("com.gradleup.shadow") version "8.3.5"
}

description = "Hedera Block Stream Tools"

application { mainClass = "com.hedera.block.tools.BlockStreamTool" }

// Generate Manifest with Main-Class and Implementation-Title
tasks.withType<Jar>().configureEach {
manifest {
attributes(
"Main-Class" to application.mainClass.get(),
"Implementation-Title" to project.name,
"Implementation-Version" to project.version
)
}
}

// Allow non-module Jar
extraJavaModuleInfo {
failOnMissingModuleInfo = false
failOnAutomaticModules = false
}

// Disable module directives scope check as we are not using modules
tasks.withType<ModuleDirectivesScopeCheck>().configureEach { enabled = false }

mainModuleInfo {
runtimeOnly("com.swirlds.config.impl")
runtimeOnly("org.apache.logging.log4j.slf4j2.impl")
runtimeOnly("io.grpc.netty.shaded")
}

testModuleInfo { requiresStatic("com.github.spotbugs.annotations") }

dependencies {
implementation(platform("com.google.cloud:libraries-bom:26.49.0"))
implementation("com.google.cloud:google-cloud-storage")
implementation("com.github.luben:zstd-jni:1.5.6-6")
implementation("info.picocli:picocli:4.7.6")
// depend on peer streams gradle module to get access to protobuf generated classes
implementation(project(":stream"))
}

tasks.withType<ShadowJar>().configureEach {
group = "shadow"

// There is an issue in the shadow plugin that it automatically accesses the
// files in 'runtimeClasspath' while Gradle is building the task graph.
// See: https://github.com/GradleUp/shadow/issues/882
dependencyFilter = NoResolveDependencyFilter()
}

// Disable dependency resolution as it conflicts with shadow plugin
class NoResolveDependencyFilter : DefaultDependencyFilter(project) {
override fun resolve(configuration: FileCollection): FileCollection {
return configuration
}
}
7 changes: 7 additions & 0 deletions tools/docs/quickstart.md
@@ -17,6 +17,13 @@
> recommended to use the project qualifier (i.e. `:tools:`) for
> both simplicity and clarity.
### Easy way for Unix-based OSs

There is a command line script for building and running the tool, located in the root of the repository. It has the
nice extra feature of giving you colored console output.
```
./tool.sh info --help
```

### Build the Tools

> **NOTE:** if you have not done so already, it is